order_remove_immediate() -- add a remove order for the job task
int order_remove_immediate(lListElem *job, lListElem *ja_task,
Generates an order of type ORT_remove_immediate_job for the given job task.
lListElem *job - The job to remove (JB_Type) lListElem *ja_task - The task to remove (JAT_Type) order_t *orders - The order structure will be extended by one del order
int - Error code: 0 = OK, 1 = Errors
MT-NOTE: order_remove_immediate() is MT safe
order_remove_order_and_immediate() -- add a remove order for the job task
int order_remove_order_and_immediate(lListElem *job, lListElem *ja_task,
Generates an order of type ORT_remove_immediate_job for the given job task. Also removes the ORT_start_job order for this task from the order list.
lListElem *job - The job to remove (JB_Type) lListElem *ja_task - The task to remove (JAT_Type) order_t *orders - The order structure for this scheduler pass, from which the ORT_start_job order will be removed
int - Error code: 0 = OK, 1 = Errors
MT-NOTE: order_remove_order_and_immediate() is MT safe
remove_immediate_job() -- test for and remove immediate job which can't be scheduled
int remove_immediate_job(lList *job_list, lListElem *job, order_t *orders,
Removes immediate jobs which cannot be scheduled from the given job list. This is done by generating an order of type ORT_remove_immediate_job. If remove_orders is set, the ORT_start_job orders are first removed from the order list before adding the remove order.
lList *job_list - The list of jobs from which the job should be removed (JB_Type) lListElem *job - The job to remove (JB_Type) order_t *orders - The order structure for this scheduler pass int remove_orders - Whether the ORT_start_job orders should also be removed
MT-NOTE: remove_immediate_job() is MT safe
remove_immediate_jobs() -- test for and remove immediate jobs which can't be scheduled
int remove_immediate_jobs(lList *pending_job_list,
Goes through all jobs in the pending list to see if any are immediate and not idle. If any are, they are removed. This is done by generating an order of type ORT_remove_immediate_job. If any array jobs are removed, the running list is checked for tasks belonging to the job, which are also removed. This is done by removing the ORT_start_job orders and adding an order of type ORT_remove_immediate_job.
lList *pending_job_list - The list of pending jobs for this scheduler pass (JB_Type) lList *running_job_list - The list of running jobs for this scheduler pass (JB_Type) order_t *orders - The order structure for this scheduler pass
int - Error code: 0 = OK, 1 = Errors -- always returns 0
MT-NOTE: remove_immediate_jobs() is MT safe
SERF_Implementation -- Functions that implement a generic schedule entry recording facility (SERF)
SERF -- Schedule entry recording facility
The functions listed below allow for plugging in any module that records schedule entries. Such a module registers through sge_serf_init() the following methods: typedef void (*record_schedule_entry_func_t)( u_long32 job_id, u_long32 ja_taskid, const char *state, u_long32 start_time, u_long32 end_time, char level_char, const char *object_name, const char *name, double utilization); typedef void (*new_schedule_func_t)(u_long32 time);
parallel_global_slots() --
dispatch_t - 0 ok got an assignment + set time for DISPATCH_TIME_QUEUE_END 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category
parallel_queue_slots() --
int - 0 ok got an assignment + set time for DISPATCH_TIME_NOW and DISPATCH_TIME_QUEUE_END (only with fixed_slot equals true) 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category
sequential_global_time() --
int - 0 ok got an assignment + set time for DISPATCH_TIME_QUEUE_END 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category
sequential_queue_time() --
dispatch_t - 0 ok got an assignment + set time for DISPATCH_TIME_NOW and DISPATCH_TIME_QUEUE_END (only with fixed_slot equals true) 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category
SPLIT_-Constants -- Constants used for split_jobs()
enum { SPLIT_FIRST, SPLIT_PENDING = SPLIT_FIRST, SPLIT_PENDING_EXCLUDED, SPLIT_PENDING_EXCLUDED_INSTANCES, SPLIT_SUSPENDED, SPLIT_WAITING_DUE_TO_PREDECESSOR, SPLIT_HOLD, SPLIT_ERROR, SPLIT_WAITING_DUE_TO_TIME, SPLIT_RUNNING, SPLIT_FINISHED, SPLIT_LAST };
SPLIT_PENDING - Pending jobs/tasks which may be dispatched SPLIT_PENDING_EXCLUDED - Pending jobs/tasks which won't be dispatched because this would exceed 'max_u_jobs' SPLIT_PENDING_EXCLUDED_INSTANCES - Pending jobs/tasks which won't be dispatched because this would exceed 'max_aj_instances' SPLIT_SUSPENDED - Suspended jobs/tasks SPLIT_WAITING_DUE_TO_PREDECESSOR - Jobs/Tasks waiting for others to finish SPLIT_HOLD - Jobs/Tasks in user/operator/system hold SPLIT_ERROR - Jobs/Tasks which are in error state SPLIT_WAITING_DUE_TO_TIME - These jobs/tasks are not dispatched because start time is in future SPLIT_RUNNING - These Jobs/Tasks won't be dispatched because they are already running SPLIT_FINISHED - Already finished jobs/tasks SPLIT_NOT_STARTED - jobs that could not be dispatched in one scheduling run SPLIT_FIRST and SPLIT_LAST might be used to build loops.
get_name_of_split_value() -- Constant to name transformation
const char* get_name_of_split_value(int value)
This function transforms a constant value in its internal name. (Used for debug output)
int value - SPLIT_-Constant
const char* - string representation of 'value'
job_get_duration() -- Determine a jobs runtime duration
bool job_get_duration(u_long32 *duration, const lListElem *jep)
The minimum of the time values the user specified with -l h_rt=<time> and -l s_rt=<time> is returned in 'duration'. If neither of these time values were specified the default duration is used.
u_long32 *duration - Returns duration on success const lListElem *jep - The job (JB_Type)
bool - true on success
MT-NOTE: job_get_duration() is MT safe
job_lists_split_with_reference_to_max_running()
void job_lists_split_with_reference_to_max_running( lList **job_lists[], lList **user_list, const char* user_name, int max_jobs_per_user)
Move those jobs which would exceed the configured 'max_u_jobs' limit (schedd configuration) from job_lists[SPLIT_PENDING] into job_lists[SPLIT_PENDING_EXCLUDED]. Only the jobs of the given 'user_name' will be handled. If 'user_name' is NULL then all jobs will be handled whose job owner is mentioned in 'user_list'.
lList **job_lists[] - Array of JB_Type lists lList **user_list - User list of Type JC_Type const char* user_name - user name int max_jobs_per_user - "max_u_jobs"
JC_jobs of the user elements contained in "user_list" has to be initialized properly before this function might be called.
job_move_first_pending_to_running() -- Move a job
void job_move_first_pending_to_running(lListElem **pending_job, lList **splitted_jobs[])
Move the 'pending_job' from 'splitted_jobs[SPLIT_PENDING]' into 'splitted_jobs[SPLIT_RUNNING]'. If 'pending_job' is an array job, then the first task (task id) will be moved into 'splitted_jobs[SPLIT_RUNNING]'
lListElem **pending_job - Pointer to a pending job (JB_Type) lList **splitted_jobs[] - (JB_Type) array of job lists
bool - true, if the pending job was removed
split_jobs() -- Split list of jobs according to their state
void split_jobs(lList **job_list, lList **answer_list, u_long32 max_aj_instances, lList **result_list[])
Split a list of jobs according to their state. 'job_list' is the input list of jobs. The jobs in this list have different job states. For the dispatch algorithm only those jobs are of interest which are really pending. Jobs which are pending and in error state or jobs which have a hold applied (start time in future, administrator hold, ...) are not necessary for the dispatch algorithm. After a call to this function the jobs of 'job_list' may have been moved into one of the 'result_list's. Each of those lists contains jobs which have a certain state. (e.g. result_list[SPLIT_WAITING_DUE_TO_TIME] will contain all jobs which have to wait according to their start time. 'max_aj_instances' are the maximum number of tasks of an array job which may be instantiated at the same time. 'max_aj_instances' is used for the split decisions. In case of any error the 'answer_list' will be used to report errors (It is not used at the moment)
lList **job_list - JB_Type input list u_long32 max_aj_instances - max. num. of task instances lList **result_list[] - Array of result list (JB_Type)
In former versions of SGE/EE we had 8 split functions. Each of those functions walked twice over the job list. This was time consuming in case of x thousand of jobs. We tried to improve this: - loop over all jobs only once - minimize copy operations where possible Unfortunately this function is hard to understand now. Sorry!
trash_splitted_jobs() -- Trash all not needed job lists
void trash_splitted_jobs(lList **splitted_job_lists[])
Trash all job lists which are not needed for scheduling decisions. Before jobs and lists are trashed, scheduling messages will be generated. Following lists will be trashed: splitted_job_lists[SPLIT_ERROR] splitted_job_lists[SPLIT_HOLD] splitted_job_lists[SPLIT_WAITING_DUE_TO_TIME] splitted_job_lists[SPLIT_WAITING_DUE_TO_PREDECESSOR] splitted_job_lists[SPLIT_PENDING_EXCLUDED_INSTANCES] splitted_job_lists[SPLIT_PENDING_EXCLUDED]
lList **splitted_job_lists[] - list of job lists
user_list_init_jc() -- inc. the # of jobs a user has running
void user_list_init_jc(lList **user_list, const lList *running_list)
Initialize "user_list" and JC_jobs attribute for each user according to the list of running jobs.
lList **user_list - JC_Type list const lList *running_list - JB_Type list
void - None
schedd_mes_add() -- Add one entry into the message structure.
void schedd_mes_add(u_long32 job_number, u_long32 message_number, ...)
During the time the scheduler tries to dispatch jobs it might call this function to add messages into a temporary structure. This function might be called several times. Each call will add one element which contains one message describing a reason, why a job can't be dispatched and the concerned jid. When it is clear if the job could be dispatched or not, one of following functions has to be called: schedd_mes_commit() schedd_mes_rollback()
u_long32 job_number - job id u_long32 message_number - message number (sge_schedd_text.h) ... - arguments for format string sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add() is MT safe
schedd_mes_add_global() -- add a global message
void schedd_mes_add_global(u_long32 message_number, ...)
Add a global message into a message structure.
u_long32 message_number - message number (sge_schedd_text.h) ... - arguments for format string sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add_global() is MT safe
schedd_mes_commit() -- Complete message elements and move them
void schedd_mes_commit(lList *job_list, int ignore_category)
Each message contained in "tmp_sme" contains only one job id. We have to find other jobs in "job_list" and add the job ids to the list of ids contained in "tmp_sme" message elements. After that we have to move all messages contained in "tmp_sme" into "sme". If "ignore_category" is 1 then the job category will be ignored. This means that all ids of "job_list" will be added to all messages contained in "tmp_sme". If no category is passed in and ignore_category is false, the messages are only generated for the current job, meaning, they are just copied.
lList *job_list - JB_Type list int ignore_category - if set to true, the messages will be generated for all jobs in the list lRef jid_category - if not NULL, the function uses the category to ensure, that every message is only added per category once.
schedd_mes_initialize() -- Initialize module variables
void schedd_mes_initialize(void)
Initialize module variables
schedd_mes_obtain_package() -- Get message structure
lListElem *schedd_mes_obtain_package(void)
Returns message structure which contains all messages.
int *global_mes_count - out: returns nr of global messages int *job_mes_count - out: returns nr of job messages
The calling function is responsible to free the returned message structure if it is not needed anymore.
lListElem* - SME_Type element
schedd_mes_rollback() -- Free temporarily generated messages
void schedd_mes_rollback(void)
Free temporarily generated messages contained in "tmp_sme".
schedd_mes_add_join() -- same as schedd_mes_add, but joins messages based on the message id.
void schedd_mes_add_join(u_long32 job_number, u_long32 message_number, ...)
same as schedd_mes_add, but joins messages based on the message id. But it only uses the temp message list and not the global one.
u_long32 job_number - job id u_long32 message_number - message number (sge_schedd_text.h) ... - arguments for format string sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add_join() is MT safe
schedd_mes_get_tmp_list() -- gets all messages for the current job
lList* schedd_mes_get_tmp_list()
returns a list of all messages for the current job
lList* - message list
schedd_mes_set_tmp_list() -- sets the messages for a current job
void schedd_mes_set_tmp_list(lListElem *category, int name, u_long32 job_number)
Takes a message list, changes the job number to the current job and stores the list.
lListElem *category - an object, which stores the list int name - element id for the list u_long32 job_number - job number
Simple-Scheduler-Interface -- Interface for custom schedulers
SGE provides a very simple interface to custom schedulers. Such scheduler can be created using the event client or the event mirror interface. The interface provides functions to start a job and to delete a job. It was created to allow an easier integration of the MAUI scheduler into Grid Engine.
-Simple-Scheduler-Interface-Typedefs -- typedefs for the SSI
typedef struct { int procs; const char *host_name; } task_map;
With a task_map a jobs structure is described. A job can be spawned over an arbitrary number of hosts. A job has an arbitrary number of tasks per host. An array of task_map is used to pass information to ssi functions. It can contain any number of entries, the last entry has to contain 0 as procs.
sge_ssi_job_cancel() -- delete or restart a job
bool sge_ssi_job_cancel(const char *job_identifier, bool reschedule)
Delete the given job. If reschedule is set to true, reschedule the job.
const char *job_identifier - job identifier in the form <jobid>.<ja_task_id>, e.g. 123.1 bool reschedule - if true, reschedule job
bool - true, if the job could be successfully deleted (rescheduled), else false.
The reschedule parameter is ignored in the current implementation.
sge_ssi_job_start() -- start a job
bool sge_ssi_job_start(const char *job_identifier, const char *pe, task_map tasks[])
Start the job described by job_identifier, pe and tasks. job_identifier has to be given in the form "<job_id>.<ja_task_id>", e.g. "123.1" and must reference a pending job/array task. For parallel jobs, pe has to be the name of an existing parallel environment. tasks describes how many tasks are to be started per host. The function creates a scheduling order and sends it to qmaster.
const char *job_identifier - unique job identifier const char *pe - name of a parallel environment or NULL for sequential jobs task_map tasks[] - mapping host->number of tasks
bool - true on success, else false
parallel_maximize_slots_pe() -- Maximize number of slots for an assignment
static int parallel_maximize_slots_pe(sge_assignment_t *best, lList *host_list, lList *queue_list, lList *centry_list, lList *acl_list)
The largest possible slot amount is searched for a job assuming a particular parallel environment is used at a particular start time. If the slot number passed is 0 we start with the minimum possible slot number for that job. To search most efficiently for the right slot value, it has three search strategies implemented: - binary search - least slot value first - highest slot value first To be able to use binary search all possible slot values are stored in one array. The slot values in this array are sorted ascending. After the right slot value is found, it is very easy to compute the best strategy from the result. For each strategy it will compute how many iterations would have been needed to compute the correct result. These steps will be stored for the next run and used to figure out the best algorithm. To ensure that we can adapt to rapid changes and also ignore spikes we are using the running average algorithm in a 80-20 setting. This means that the algorithm will need 4 (max 5) iterations to adapt to a new scenario. Further enhancements: It might be a good idea to store the derived values with the job categories and allow finding the best strategy per category.
sge_assignment_t *best - herein we keep all important in/out information lList *host_list - a list of all available hosts lList *queue_list - a list of all available queues lList *centry_list - a list of all available complex attributes lList *acl_list - a list of all access lists
int - 0 ok got an assignment (maybe without maximizing it) 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: parallel_maximize_slots_pe() is MT safe as long as the provided lists are owned by the caller SEE ALSO: sconf_best_pe_alg sconf_update_pe_alg add_pe_slots_to_category
parallel_reservation_max_time_slots() -- Search earliest possible assignment
static dispatch_t parallel_reservation_max_time_slots(sge_assignment_t *best)
The earliest possible assignment is searched for a job assuming a particular parallel environment is used with a particular slot number. If the slot number passed is 0 we start with the minimum possible slot number for that job. The search starts with the latest queue end time if DISPATCH_TIME_QUEUE_END was specified rather than a real time value.
sge_assignment_t *best - herein we keep all important in/out information
dispatch_t - 0 ok got an assignment 1 no assignment at the specified time (???) -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: parallel_reservation_max_time_slots() is not MT safe
sge_select_parallel_environment() -- Decide about a PE assignment
static dispatch_t sge_select_parallel_environment(sge_assignment_t *best, lList *pe_list)
When users use wildcard PE request such as -pe <pe_range> 'mpi8_*' more than a single parallel environment can match the wildcard expression. In case of 'now' assignments the PE that gives us the largest assignment is selected. When scheduling a reservation we search for the earliest assignment for each PE and then choose that one that finally gets us the maximum number of slots.
The scheduler info messages are not cached. They are added globally and have to be added for each job in the category. When the messages are updated this has to be changed.
sge_assignment_t *best - herein we keep all important in/out information lList *pe_list - the list of all parallel environments (PE_Type)
dispatch_t - 0 ok got an assignment 1 no assignment at the specified time (???) -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: sge_select_parallel_environment() is not MT safe
build_name_filter() -- fills in an array with complex names, which can be used as a filter.
void build_name_filter(const char **filter, lList *list, int t_name, int *pos)
Takes an array of a given size and fills in complex names.
const char **filter - target for the filter strings. It has to be of sufficient size. lList *list - a list of complexes, from which the names are extracted int t_name - specifies the field which is used as a name
???
get_attribute_list() -- generates a list for all defined elements in a queue, host, global
static lList* get_attribute_list(lListElem *global, lListElem *host, lListElem *queue, lList *centry_list)
Generates a list for all attributes defined at the given queue, host, global.
lListElem *global - global host lListElem *host - host (or NULL, if only global attributes are important) lListElem *queue - queue (or NULL if only host/global attributes are important) lList *centry_list - system wide attribute config list
static lList* - list of attributes or NULL, if no attributes exist.
get_attribute_list_by_names() -- generates a list of attributes from the given names
static lList* get_attribute_list_by_names(lListElem *global, lListElem *host, lListElem *queue, lList *centry_list, lList *attrnames)
Assembles a list of attributes for a given queue, host, global, which contains all the specified elements. The general sort order is, global, host, queue. If an element could not be found, it will not exist. If no elements exist, the function will return NULL
lListElem *global - global host lListElem *host - host (or NULL, if only global resources are asked for ) lListElem *queue - queue (or NULL, if only global / host resources are asked for) lList *centry_list - the system wide attribute config list lList *attrnames - ST_Type list of attribute names
static lList* - a CULL list of elements or NULL
is_attr_prior2() -- checks, if the set value in the structure has a higher priority than the new one
static bool is_attr_prior2(lListElem *upper_el, double lower_value, int t_value, int t_dominant)
Computes the priority between a given structure and its values and a new value. This is based on some basic rules: whether the value is set (dominant == DOMINANT_TYPE_VALUE) and which relational operator is used. If this is not enough, the two values are compared and based on the operator, it returns a true or false: if no value is set in the structure: false if the relops are == or != : true if the relops are >= or > : true, when the new value is smaller than the old one if the relops are <= or < : true, when the new value is bigger than the old one
lListElem *upper_el - target structure double lower_value - new value int t_value - which field to use (CE_doubleval or CE_pj_doubleval) int t_dominant - which dominant field to use (CE_dominant, CE_pj_dominant)
static bool - true, if the value in the structure has the higher priority
request_cq_rejected() -- Check, if -l request forecloses cluster queue
bool request_cq_rejected(const lList* hard_resource_list, const lListElem *cq, const lList *centry_list, dstring *unsatisfied)
Do -l matching with the aim to foreclose the entire cluster queue. Each cluster queue configuration profile must specify a fixed value otherwise we can't rule out a cluster queue. Both complex_values and queue resource limits are checked.
const lList* hard_resource_list - resource list -l (CE_Type) const lListElem *cq - cluster queue (CQ_Type) const lList *centry_list - complex entry list (CE_Type) dstring *unsatisfied - diagnosis information, if rejected
bool - true, if the cluster queue is ruled out
MT-NOTE: request_cq_rejected() is MT safe
sge_dlib() -- lookup, load, and cache function from a dynamic library
void *sge_dlib(const char *key, const char *lib_name, const char *fn_name, lib_cache_t **lib_cache_list)
const char *key - unique key for identifying function const char *lib_name - dynamic library name const char *fn_nam - function name lib_cache_t **lib_cache_list - cache list (if NULL, we use a global cache)
void * - the address of the function
MT-NOTE: sge_free_load_list() is not MT safe
sge_job_slot_request() -- return static urgency jobs slot request
int sge_job_slot_request(lListElem *job, lList *pe_list)
For sequential jobs the static urgency job slot request is always 1. For parallel jobs the static urgency job slot request depends on static urgency slots as defined with sge_pe(5).
lListElem *job - the job (JB_Type) lList *pe_list - the PE list (PE_Type)
int - Number of slots
In case of a wildcard parallel environment request the setting of the first matching is used. Behaviour is undefined if multiple parallel environments specify different settings!
task_get_duration() -- Determine tasks effective runtime limit
bool task_get_duration(u_long32 *duration, const lListElem *ja_task)
Determines the effective runtime limit got by requested h_rt/s_rt or by the resulting queues h_rt/s_rt
u_long32 *duration - tasks duration in seconds const lListElem *ja_task - task element
bool - true
MT-NOTE: task_get_duration() is MT safe
sge_GetNumberOfOrders() -- returns the number of orders generated
int sge_GetNumberOfOrders(order_t *orders)
returns the number of orders generated
order_t *orders - a structure of orders
int - number of orders in the structure
MT-NOTE: sge_GetNumberOfOrders() is MT safe
sge_add_schedd_info() -- retrieves the messages and generates an order out of it.
lList* sge_add_schedd_info(lList *or_list, int *global_mes_count, int *job_mes_count)
retrieves all messages, puts them into an order package, and frees the original messages. It also returns the number of global and job messages.
lList *or_list - int: the order list to which the message order is added int *global_mes_count - out: global message count int *job_mes_count - out: job message count
lList* - the order list
MT-NOTE: sge_add_schedd_info() is not MT safe
sge_create_orders() -- Create a new order-list or add orders to an existing one
lList* sge_create_orders(lList *or_list, u_long32 type, lListElem *job, lListElem *ja_task, lList *granted, bool update_execd)
- If the or_list is NULL, a new one will be generated - in case of a clear_pri order, the ja_task is important. If NULL is put in for ja_task, only the pending tasks of the specified job are set to NULL. If a ja_task is put in, all tasks of the job are set to NULL
lList *or_list - the order list u_long32 type - order type lListElem *job - job lListElem *ja_task - ja_task ref or NULL(there is only one case, where it can be NULL) lList *granted - granted queue list bool update_execd - should the execd get new ticket values?
lList* - returns the orderlist
MT-NOTE: sge_create_orders() is MT safe
sge_join_orders() -- generates one order list from the order structure
lList* sge_join_orders(order_t orders)
generates one order list from the order structure, and cleans the order structure. The orders, which have been sent already, are removed.
order_t orders - the order structure
lList* - an order list
MT-NOTE: sge_join_orders() is not MT safe
pe_match_static() -- Why not job to PE?
int pe_match_static(lListElem *job, lListElem *pe, lList *acl_list, bool only_static_checks)
Checks if PE is suited for the job.
lListElem *job - ??? lListElem *pe - ??? lList *acl_list - ??? bool only_static_checks - ???
dispatch_t - DISPATCH_OK ok DISPATCH_NEVER_CAT assignment will never be possible for all jobs of that category
MT-NOTE: pe_restricted() is not MT safe
sge_qeti_list_add() -- Adds a resource utilization to QETI resource list
static int sge_qeti_list_add(lList **lpp, const char *name, lList* rue_lp, double total, bool must_exist)
???
lList **lpp - QETI resource list const char *name - Name of the resource lList* rue_lp - Resource utilization entry (RUE_Type) double total - Total resource amount bool must_exist - If true the entry must exist in 'lpp'.
static int - 0 on success
MT-NOTE: sge_qeti_list_add() is not MT safe
sge_qeti_next_before() -- ???
void sge_qeti_next_before(sge_qeti_t *qeti, u_long32 start)
All queue end next references are set so that sge_qeti_next() will return a time value that is before (i.e. less than) start.
sge_qeti_t *qeti - ??? u_long32 start - ???
MT-NOTE: sge_qeti_next_before() is MT safe
check_and_debit_rqs_slots() -- Determine RQS limit slot amount and debit
static void check_and_debit_rqs_slots(sge_assignment_t *a, const char *host, const char *queue, int *slots, int *slots_qend, dstring *rule_name, dstring *rue_name, dstring *limit_name)
The function determines the final slot and slots_qend amount due to all resource quota limitations that apply for the queue instance. Both slot amounts get debited from the a->limit_list to keep track of still available amounts per resource quota limit.
sge_assignment_t *a - Assignment data structure const char *host - hostname const char *queue - queuename int *slots - needed/available slots int *slots_qend - needed/available slots_qend dstring *rule_name - caller maintained buffer dstring *rue_name - caller maintained buffer dstring *limit_name - caller maintained buffer
MT-NOTE: check_and_debit_rqs_slots() is MT safe
cqueue_shadowed() -- Check for cluster queue rule before current rule
static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a)
Check whether there is any cluster queue specific rule before the current rule.
const lListElem *rule - Current rule sge_assignment_t *a - Scheduler assignment
static bool - True if shadowed
limit queue Q001 to F001=1 limit host gridware to F001=0 (--> returns 'true' due to 'Q001' meaning that gridware can't be generally ruled out )
MT-NOTE: cqueue_shadowed() is MT safe
cqueue_shadowed_by() -- Check rules shadowing current cluster queue rule
static bool cqueue_shadowed_by(const char *cqname, const lListElem *rule, sge_assignment_t *a)
Check if cluster queue in current rule is shadowed.
const char *cqname - Cluster queue name to check const lListElem *rule - Current rule sge_assignment_t *a - Assignment
static bool - True if shadowed
limits queues Q001,Q002 to F001=1 limits queues Q002,Q003 to F001=1 (--> returns 'true' for Q002 and 'false' for Q003)
MT-NOTE: cqueue_shadowed_by() is MT safe
debit_job_from_rqs() -- debits job in all relevant resource quotas
int debit_job_from_rqs(lListElem *job, lList *granted, lListElem* pe, lList *centry_list)
The function debits in all relevant rules the requested amount of resources.
lListElem *job - job request (JB_Type) lList *granted - granted list (JG_Type) lListElem* pe - granted pe (PE_Type) lList *centry_list - consumable resources list (CE_Type)
int - always 0
MT-NOTE: debit_job_from_rqs() is not MT safe
host_shadowed() -- Check for host rule before current rule
static bool host_shadowed(const lListElem *rule, sge_assignment_t *a)
Check whether there is any host specific rule before the current rule.
const lListElem *rule - Current rule sge_assignment_t *a - Scheduler assignment
static bool - True if shadowed
limit host gridware to F001=1 limit queue Q001 to F001=0 (--> returns 'true' due to 'gridware' meaning that Q001 can't be generally ruled out )
MT-NOTE: host_shadowed() is MT safe
host_shadowed_by() -- ???
static bool host_shadowed_by(const char *host, const lListElem *rule, sge_assignment_t *a)
Check if host in current rule is shadowed.
const char *host - Host name to check const lListElem *rule - Current rule sge_assignment_t *a - Assignment
static bool - True if shadowed
limits hosts host1,host2 to F001=1 limits hosts host2,host3 to F001=1 (--> returns 'true' for host2 and 'false' for host3)
MT-NOTE: host_shadowed_by() is MT safe
is_cqueue_expand() -- Returns true if rule expands on cluster queues
bool is_cqueue_expand(const lListElem *rule)
Returns true if rule expands on cluster queues.
const lListElem *rule - RQR_Type
bool - True if rule expands on cluster queues
"queues {*}" returns true "queues Q001,Q002" returns false
MT-NOTE: is_cqueue_expand() is MT safe
is_cqueue_global() -- Global rule with regards to cluster queues?
bool is_cqueue_global(const lListElem *rule)
const lListElem *rule - RQR_Type
bool - True if cluster queues play no role with the rule
MT-NOTE: is_cqueue_global() is MT safe
is_host_expand() -- Returns true if rule expands on hosts
bool is_host_expand(const lListElem *rule)
Returns true if rule expands on hosts.
const lListElem *rule - RQR_Type
bool - True if rule expands on hosts
"hosts {*}" returns true "hosts @allhosts" returns false
MT-NOTE: is_host_expand() is MT safe
is_host_global() -- Global rule with regards to hosts?
bool is_host_global(const lListElem *rule)
Return true if hosts play no role with the rule
const lListElem *rule - RQR_Type
bool - True if hosts play no role with the rule
MT-NOTE: is_host_global() is MT safe
parallel_limit_slots_by_time() -- Determine number of slots avail. within time frame
static dispatch_t parallel_limit_slots_by_time(const sge_assignment_t *a, lList *requests, int *slots, int *slots_qend, lListElem *centry, lListElem *limit, dstring rue_name)
???
const sge_assignment_t *a - job info structure (in) lList *requests - Job request list (CE_Type) int *slots - out: free slots int *slots_qend - out: free slots in the far far future lListElem *centry - Load information for the resource lListElem *limit - limitation (RQRL_Type) dstring rue_name - rue_name saved in limit sublist RQRL_usage lListElem *qep - queue instance (QU_Type)
static dispatch_t - DISPATCH_OK got an assignment - DISPATCH_NEVER_CAT no assignment for all jobs of that category
MT-NOTE: parallel_limit_slots_by_time() is not MT safe
parallel_rqs_slots_by_time() -- Determine number of slots avail within time frame
dispatch_t parallel_rqs_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend, const char *host, const char *queue)
This function iterates for a queue instance over all resource quota sets and evaluates the number of slots available.
const sge_assignment_t *a - job info structure (in) int *slots - out: # free slots int *slots_qend - out: # free slots in the far far future lListElem *qep - QU_Type Elem
static dispatch_t - DISPATCH_OK got an assignment - DISPATCH_NEVER_CAT no assignment for all jobs of that category
MT-NOTE: parallel_rqs_slots_by_time() is not MT safe
rqs_by_slots() -- Check queue instance suitability due to RQS
dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue, const char *host, u_long32 *tt_rqs_all, bool *is_global, dstring *rue_string, dstring *limit_name, dstring *rule_name)
Checks (or determines earliest time) queue instance suitability according to resource quota set limits. For performance reasons RQS verification results are cached in a->limit_list. In addition unsuited queues and hosts are collected in a->skip_cqueue_list and a->skip_host_list so that ruling out chunks of queue instances becomes quite cheap.
sge_assignment_t *a - assignment const char *queue - cluster queue name const char *host - host name u_long32 *tt_rqs_all - returns earliest time over all resource quotas bool *is_global - returns true if result is valid for any other queue dstring *rue_string - caller maintained buffer dstring *limit_name - caller maintained buffer dstring *rule_name - caller maintained buffer u_long32 tt_best - time of best solution found so far
static dispatch_t - usual return values
MT-NOTE: rqs_by_slots() is MT safe
rqs_can_optimize() -- Poke whether a queue/host negation can be made
static void rqs_can_optimize(const lListElem *rule, bool *host, bool *queue, sge_assignment_t *a)
A global limit was hit with 'rule'. This function helps to determine to what extent we can profit from that situation. If there is no previous matching rule within the same rule set any other queue/host can be skipped.
const lListElem *rule - Rule bool *host - Any previous rule with a host scope? bool *queue - Any previous rule with a queue scope? sge_assignment_t *a - Scheduler assignment
MT-NOTE: rqs_can_optimize() is MT safe
rqs_exceeded_sort_out() -- Rule out queues/hosts whenever possible
bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, const char* queue_name, const char* host_name)
This function tries to rule out hosts and cluster queues after a quota exceeding was found for a limitation rule with specific queue instance. When a limitation was exceeded that applies to the entire cluster 'true' is returned, 'false' otherwise.
sge_assignment_t *a - Scheduler assignment type const lListElem *rule - The exceeded rule const dstring *rule_name - Name of the rule (monitoring only) const char* queue_name - Cluster queue name const char* host_name - Host name
bool - True upon global limits exceeding
MT-NOTE: rqs_exceeded_sort_out() is MT safe
rqs_exceeded_sort_out_par() -- Rule out queues/hosts whenever possible
void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem *rule, const dstring *rule_name, const char* queue_name, const char* host_name)
Function wrapper around rqs_exceeded_sort_out() for parallel jobs. In contrast to the sequential case global limit exceeding is handled by adding all cluster queue names to the a->skip_cqueue_list.
sge_assignment_t *a - Scheduler assignment type const lListElem *rule - The exceeded rule const dstring *rule_name - Name of the rule (monitoring only) const char* queue_name - Cluster queue name const char* host_name - Host name
MT-NOTE: rqs_exceeded_sort_out_par() is MT safe
rqs_excluded_cqueues() -- Find excluded queues
static void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a)
Find queues that are excluded by previous rules.
const lListElem *rule - The rule sge_assignment_t *a - Scheduler assignment
limit projects {*} queues !Q001 to F001=1 limit to F001=0 ( ---> returns Q001 in a->skip_cqueue_list)
MT-NOTE: rqs_excluded_cqueues() is MT safe
rqs_excluded_hosts() -- Find excluded hosts
static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a)
Find hosts that are excluded by previous rules.
const lListElem *rule - The rule sge_assignment_t *a - Scheduler assignment
limit projects {*} queues !gridware to F001=1 limit to F001=0 ( ---> returns gridware in skip_host_list)
MT-NOTE: rqs_excluded_hosts() is MT safe
rqs_expand_cqueues() -- Add all matching cqueues to the list
void rqs_expand_cqueues(const lListElem *rule)
The names of all cluster queues that match the rule are added to the skip list without duplicates.
const lListElem *rule - RQR_Type
MT-NOTE: rqs_expand_cqueues() is not MT safe
rqs_expand_hosts() -- Add all matching hosts to the list
void rqs_expand_hosts(const lListElem *rule, lList **skip_host_list, const lList *host_list, lList *hgrp_list)
The names of all hosts that match the rule are added to the skip list without duplicates.
const lListElem *rule - RQR_Type const lList *host_list - EH_Type
MT-NOTE: rqs_expand_hosts() is MT safe
rqs_limitation_reached() -- is the limitation reached for a queue instance
static bool rqs_limitation_reached(sge_assignment_t *a, lListElem *rule, const char* host, const char* queue)
The function verifies no limitation is reached for the specific job request and queue instance
sge_assignment_t *a - job info structure const lListElem *rule - resource quota rule (RQR_Type) const char* host - host name const char* queue - queue name u_long32 *start - start time of job
static dispatch_t - DISPATCH_OK job can be scheduled DISPATCH_NEVER_CAT no jobs of this category will be scheduled DISPATCH_NOT_AT_TIME job can be scheduled later DISPATCH_MISSING_ATTR rule does not match requested attributes
MT-NOTE: rqs_limitation_reached() is not MT safe
rqs_match_assignment() -- match resource quota rule any queue instance
static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t *a)
Check whether a resource quota rule can match any queue instance. If it does not match due to users/projects/pes scope one can rule this out. Note: As long as rqs_match_assignment() is not used for parallel jobs passing NULL as PE request is perfectly fine.
const lListElem *rule - Resource quota rule sge_assignment_t *a - Scheduler assignment
static bool - True if it matches
MT-NOTE: rqs_match_assignment() is MT safe
rqs_set_dynamical_limit() -- evaluate dynamical limit
bool rqs_set_dynamical_limit(lListElem *limit, lListElem *global_host, lListElem *exec_host, lList *centry)
The function evaluates if necessary the dynamical limit for a host and sets the evaluated double value in the given limitation element (RQRL_dvalue). An evaluation is necessary if the limit boolean RQRL_dynamic is true. This field is set by qmaster during the rule set verification
lListElem *limit - limitation (RQRL_Type) lListElem *global_host - global host (EH_Type) lListElem *exec_host - exec host (EH_Type) lList *centry - consumable resource list (CE_Type)
bool - always true
MT-NOTE: rqs_set_dynamical_limit() is MT safe
sge_user_is_referenced_in_rqs() -- search for user reference in rqs
bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user, lList *acl_list)
Search for a user reference in the resource quota sets
const lList *rqs - resource quota set list const char *user - user to search const char *group - user's group lList *acl_list - acl list for user resolving
bool - true if user was found false if user was not found
MT-NOTE: sge_user_is_referenced_in_rqs() is MT safe
add_calendar_to_schedule() -- adds the queue calendar to the resource schedule
static void add_calendar_to_schedule(lList *queue_list)
Adds the queue calendars to the resource schedule. It uses the slot entry for simulating an enabled / disabled calendar.
lList *queue_list - all queues, which can possibly run jobs u_long32 now - now time of assignment
MT-NOTE: add_calendar_to_schedule() is MT safe
add_job_utilization() -- Debit assignments' utilization from all schedules
int add_job_utilization(const sge_assignment_t *a, const char *type)
The resource utilization of an assignment is debited into the schedules of global, host and queue instance resource containers and limitation rule sets. For parallel jobs debiting is done from the parallel environment schedule.
const sge_assignment_t *a - The assignment const char *type - A string that is used to monitor assignment type bool for_job_scheduling - utilize for job or for advance reservation
int -
MT-NOTE: add_job_utilization() is MT safe
newResourceElem() -- creates new resource schedule entry
static lListElem* newResourceElem(u_long32 time, double amount)
creates new resource schedule entry and returns it
u_long32 time - specific time double amount - the utilized amount
static lListElem* - new resource schedule entry
MT-NOTE: newResourceElem() is MT safe
prepare_resource_schedules() -- Debit non-pending jobs in resource schedule
static void prepare_resource_schedules(const lList *running_jobs, const lList *suspended_jobs, lList *pe_list, lList *host_list, lList *queue_list, lList *centry_list, lList *rqs_list)
In order to reflect current and future resource utilization of running and suspended jobs in the schedule we iterate through all jobs and debit resources requested by those jobs.
const lList *running_jobs - The running ones (JB_Type) const lList *suspended_jobs - The suspended ones (JB_Type) lList *pe_list - ??? lList *host_list - ??? lList *queue_list - ??? lList *rqs_list - configured resource quota sets lList *centry_list - ??? lList *acl_list - ??? lList *hgroup_list - ??? lList *prepare_resource_schedules - create schedule for job or advance reservation scheduling bool for_job_scheduling - prepare for job or for advance reservation u_long32 now - now time of assignment
MT-NOTE: prepare_resource_schedules() is not MT safe
rqs_add_job_utilization() -- Debit assignment's utilization in a limitation rule
static int rqs_add_job_utilization(lListElem *jep, u_long32 task_id, const char *type, lListElem *rule, dstring rue_name, lList *centry_list, int slots, const char *obj_name, u_long32 start_time, u_long32 end_time, bool is_master_task)
???
lListElem *jep - job element (JB_Type) u_long32 task_id - task id to debit const char *type - String denoting type of utilization entry lListElem *rule - limitation rule (RQR_Type) dstring rue_name - rue_name where to debit lList *centry_list - master centry list (CE_Type) int slots - slots to debit const char *obj_name - name of the object where to debit u_long32 start_time - start time of utilization u_long32 end_time - end time of utilization bool is_master_task - is this the master task going to be debit
static int - number of modified limits
MT-NOTE: rqs_add_job_utilization() is MT safe
serf_exit() -- Closes SERF
void serf_exit(void)
All operations required to cleanly shutdown the SERF are done.
MT-NOTE: serf_exit() is MT safe
serf_init() -- Initializes SERF
void serf_init(record_schedule_entry_func_t write, new_schedule_func_t newline)
MT-NOTE: serf_init() is not MT safe
serf_new_interval() -- Indicate a new scheduling run
void serf_new_interval(u_long32 time)
When a new scheduling run is started serf_new_interval() shall be called to indicate this. This allows assigning of schedule entry records to different schedule runs.
u_long32 time - The time when the schedule run was started.
MT-NOTE: (1) serf_new_interval() is MT safe if no recording function MT-NOTE: was registered via serf_init(). MT-NOTE: (2) Otherwise MT safety of serf_new_interval() depends on MT-NOTE: MT safety of registered recording function
serf_record_entry() -- Add a new schedule entry record
void serf_record_entry(u_long32 job_id, u_long32 ja_taskid, const char *state, u_long32 start_time, u_long32 end_time, char level_char, const char *object_name, const char *name, double utilization)
The entirety of all information passed to this function describes the schedule that was created during a scheduling interval of a Grid Engine scheduler. To reflect multiple resource debitations of a job multiple calls to serf_record_entry() are required. For parallel jobs serf_record_entry() is called once with a 'P' as level_char.
u_long32 job_id - The job id u_long32 ja_taskid - The task id const char *type - A string indicating the reason why the utilization was put into the schedule: RUNNING - Job was running before scheduling run SUSPENDED - Job was suspended before scheduling run MIGRATING - Job being preempted (unused) STARTING - Job will be started RESERVING - Job reserves resources u_long32 start_time - Start of the resource utilization u_long32 end_time - End of the resource utilization char level_char - Q - Queue H - Host G - Global P - Parallel Environment (PE) const char *object_name - Name of Queue/Host/Global/PE const char *name - Resource name double utilization - Utilization amount
MT-NOTE: (1) serf_record_entry() is MT safe if no recording function MT-NOTE: was registered via serf_init(). MT-NOTE: (2) Otherwise MT safety of serf_record_entry() depends on MT-NOTE: MT safety of registered recording function
set_utilization() -- adds one specific calendar entry to the resource schedule
static void set_utilization(lList *uti_list, u_long32 from, u_long32 till, double uti)
This set utilization function is unique for calendars. It removes all other uti settings in the given time interval and replaces it with the given one.
lList *uti_list - the uti list for a specific resource and queue u_long32 from - starting time for this uti u_long32 till - end time for this uti double uti - utilization (needs to be bigger than 1; should be max)
MT-NOTE: set_utilization() is MT safe
sge_qeti_first() --
u_long32 sge_qeti_first(sge_qeti_t *qeti)
Initialize/Reinitialize Queue End Time Iterator. All queue end next references are initialized to the queue end of all resource instances. Before we return the time that is most in the future queue end next references are switched to the next entry that is earlier than the time that was returned.
sge_qeti_t *qeti - ???
u_long32 -
MT-NOTE: sge_qeti_first() is MT safe
sge_qeti_next() -- ???
u_long32 sge_qeti_next(sge_qeti_t *qeti)
Return the next time that is most in the future. Then queue end next references are switched to the next entry that is earlier than the time that was returned.
sge_qeti_t *qeti - ???
u_long32 -
MT-NOTE: sge_qeti_next() is MT safe
sge_qeti_release() -- Release queue end time iterator
void sge_qeti_release(sge_qeti_t *qeti)
Release all resources of the queue end time iterator. Referred resource utilization diagrams are not affected.
sge_qeti_t *qeti - ???
MT-NOTE: sge_qeti_release() is MT safe
utilization_add() -- Debit a job's resource utilization
int utilization_add(lListElem *cr, u_long32 start_time, u_long32 duration, double utilization, u_long32 job_id, u_long32 ja_taskid, u_long32 level, const char *object_name, const char *type)
A job's resource utilization is debited into the resource utilization diagram at the given time for the given duration.
lListElem *cr - Resource utilization entry (RUE_Type) u_long32 start_time - Start time of utilization u_long32 duration - Duration double utilization - Amount u_long32 job_id - Job id u_long32 ja_taskid - Task id u_long32 level - *_TAG const char *object_name - The objects name const char *type - String denoting type of utilization entry. bool is_job - reserve for job or for advance reservation bool implicit_non_exclusive - add implicit entry for non-exclusive jobs requesting a exclusive centry
int - 0 on success
MT-NOTE: utilization_add() is not MT safe
utilization_below() -- Determine earliest time util is below max_util
u_long32 utilization_below(const lListElem *cr, double max_util, const char *object_name)
Determine and return earliest time utilization is below max_util.
const lListElem *cr - Resource utilization entry (RUE_utilized) double max_util - The maximum utilization we're asking const char *object_name - Name of the queue/host/global for monitoring purposes. bool for_excl_request - match for exclusive request
u_long32 - The earliest time or DISPATCH_TIME_NOW.
MT-NOTE: utilization_below() is MT safe
utilization_max() -- Determine max utilization within timeframe
double utilization_max(const lListElem *cr, u_long32 start_time, u_long32 duration)
Determines the maximum utilization at the given timeframe.
const lListElem *cr - Resource utilization entry (RUE_utilized) u_long32 start_time - Start time of the timeframe u_long32 duration - Duration of timeframe bool for_excl_request - For exclusive request
double - Maximum utilization
MT-NOTE: utilization_max() is MT safe
utilization_print_to_dstring() -- Print resource utilization to dstring
bool utilization_print_to_dstring(const lListElem *this_elem, dstring *string)
Print resource utilization as plain number to dstring.
const lListElem *this_elem - A RUE_Type element dstring *string - The string
bool - error state true - success false - error
MT-NOTE: utilization_print_to_dstring() is MT safe
utilization_queue_end() -- Determine utilization at queue end time
double utilization_queue_end(const lListElem *cr)
Determine utilization at queue end time. Jobs that last until ever can cause a non-zero utilization.
const lListElem *cr - Resource utilization entry (RUE_utilized) bool for_excl_request - For exclusive request
double - queue end utilization
MT-NOTE: utilization_queue_end() is MT safe
sge_get_schedd_text() -- transforms an id into an info message
const char* sge_get_schedd_text(int nr)
transforms an id into an info message
int nr - info id
const char* - info message
MT-NOTE: sge_get_schedd_text() is MT safe
access_cq_rejected() -- Check, if cluster queue rejects user/project
static bool access_cq_rejected(const char *user, const char *group, const lList *acl_list, const lListElem *cq)
???
const char *user - Username const char *group - Groupname const lList *acl_list - List of access list definitions const lListElem *cq - Cluster queue
static bool - True, if rejected
MT-NOTE: access_cq_rejected() is MT safe
add_pe_slots_to_category() -- defines an array of valid slot values
static bool add_pe_slots_to_category(category_use_t *use_category, u_long32 *max_slotsp, lListElem *pe, int min_slots, int max_slots, lList *pe_range)
In case of pe ranges this function allocates memory and fills it with valid pe slot values. If a category is set, it stores them in the category for further jobs.
category_use_t *use_category - category caching structure, must not be NULL u_long32 *max_slotsp - number of different slot settings lListElem *pe - pe, must not be NULL int min_slots - min slot setting (pe range) int max_slots - max slot setting (pe range) lList *pe_range - pe range, must not be NULL
static bool - true, if successful
MT-NOTE: add_pe_slots_to_category() is MT safe
clean_up_parallel_job() -- removes tags
static void clean_up_parallel_job(sge_assignment_t *a)
During pe job dispatch many queues and hosts are tagged. This function removes the tags.
sge_assignment_t *a - the resource structure
MT-NOTE: clean_up_parallel_job() is not MT safe
clear_resource_tags() -- removes the tags from a resource request.
static void clear_resource_tags(lList *resources, u_long32 max_tag)
Removes the tags from the given resource list. A tag is only removed if it is smaller or equal to the given tag value. The tag value "MAX_TAG" results in removing all existing tags, or the value "HOST_TAG" removes queue and host tags but keeps the global tags.
lList *resources - list of job requests. u_long32 max_tag - max tag element
compute_soft_violations() -- counts the violations in the request for a given host or queue
static int compute_soft_violations(lListElem *queue, int violation, lListElem *job,lList *load_attr, lList *config_attr, lList *actual_attr, lList *centry_list, u_long32 layer, double lc_factor, u_long32 tag)
this function checks if the current resources can satisfy the requests. The resources come from the global host, a given host or the queue. The function returns the number of violations.
const sge_assignment_t *a - job info structure lListElem *queue - should only be set, when using this method on queue level int violation - the number of previous violations. This is needed to get a correct result on queue level. lList *load_attr - the load attributes, only when used on hosts or global lList *config_attr - a list of custom attributes (CE_Type) lList *actual_attr - a list of custom consumables, they contain the current usage of these attributes (RUE_Type) u_long32 layer - the current layer flag double lc_factor - should be set, when load correction has to be done. u_long32 tag - the current layer tag. (GLOBAL_TAG, HOST_TAG, QUEUE_TAG)
static int - the number of violations ( = (prev. violations) + (new violations in this run)).
cqueue_match_static() -- Does cluster queue match the job?
static dispatch_t cqueue_match_static(const char *cqname, sge_assignment_t *a)
The function tries to find reasons (-q, -l and -P) why the entire cluster is not suited for the job.
const char *cqname - Cluster queue name sge_assignment_t *a - ???
static dispatch_t - Returns DISPATCH_OK or DISPATCH_NEVER_CAT
MT-NOTE: cqueue_match_static() is MT safe
fill_category_use_t() -- fills the category_use_t structure.
void fill_category_use_t(sge_assignment_t *a, category_use_t *use_category, const char *pe_name)
If a cache structure for the given PE does not exist, it will generate the necessary data structures.
sge_assignment_t *a - job info structure (in) category_use_t *use_category - category info structure (out) const char* pe_name - the current pe name or "NONE"
MT-NOTE: fill_category_use_t() is MT safe
get_attribute() -- looks for an attribute, but only for one level (for host, global, or queue)
static lListElem* get_attribute(const char *attrname, lList *config_attr, lList *actual_attr, lList *load_attr, lList *centry_list, lListElem *queue, lListElem *rep, u_long32 layer, double lc_factor, dstring *reason)
Extracts the attribute specified with 'attrname' and finds the more important one, if it is defined multiple times on the same level. It only cares about one level. If the attribute is a consumable, one can specify a point in time and a duration. This will get the caller the min amount of that resource during the time frame.
const char *attrname - attribute name one is looking for lList *config_attr - user defined attributes (CE_Type) lList *actual_attr - current usage of consumables (RUE_Type) lList *load_attr - load attributes lList *centry_list - the system wide attribute configuration lListElem *queue - the current queue, or null, if one works on hosts u_long32 layer - the current layer double lc_factor - the load correction value dstring *reason - space for error messages or NULL bool zero_utilization - ??? u_long32 start_time - begin of the time interval, one asks for the resource u_long32 duration - the duration of the interval
static lListElem* - the element one was looking for or NULL
get_attribute_by_Name() -- returns an attribute by name
void lListElem* get_attribute_by_Name(lListElem* global, lListElem *host, lListElem *queue, const char* attrname, lList *centry_list, char * reason, int reason_size)
It looks into the different configurations on host, global and queue and returns the attribute, which was asked for. If the attribute is defined multiple times, only the valid one is returned.
lListElem* global - the global host lListElem *host - a given host; can be null, then only the global host is important lListElem *queue - a queue on the given host; can be null, then only the host and global are important const char* attrname - the attribute name one is looking for lList *centry_list - the system wide attribute config list char *reason - memory for the error message int reason_size - the max length of an error message
void lListElem* - the element one is looking for (a copy) or NULL.
get_queue_resource() -- extracts attribute information from the queue
static lListElem* get_queue_resource(lListElem *queue, lList *centry_list, const char *attrname)
All fixed queue attributes are directly coded into the queue structure. These have to be extracted and formed into a CE structure. That is what this function does. It takes a name for an attribute and returns a full CE structure, if the attribute is set in the queue. Otherwise it returns NULL.
lListElem *queue_elem - lListElm *queue - const char *attrname - name of the attribute.
bool -
host_time_by_slots() -- Return time when host slots are available
int host_time_by_slots(int slots, u_long32 *start, u_long32 duration, int *host_soft_violations, lListElem *job, lListElem *ja_task, lListElem *hep, lList *centry_list, lList *acl_list)
The time when the specified slot amount is available at the host is determined. Behaviour depends on input/output parameter start DISPATCH_TIME_NOW 0 an assignment is possible now 1 no assignment now but later -1 assignment never possible for all jobs of the same category -2 assignment never possible for that particular job <any other time> 0 an assignment is possible at the specified time 1 no assignment at specified time but later -1 assignment never possible for all jobs of the same category -2 assignment never possible for that particular job DISPATCH_TIME_QUEUE_END 0 an assignment is possible and the start time is returned -1 assignment never possible for all jobs of the same category -2 assignment never possible for that particular job
int slots - ??? u_long32 *start - ??? u_long32 duration - ??? int *host_soft_violations - ??? lListElem *job - ??? lListElem *ja_task - ??? lListElem *hep - ??? lList *centry_list - ??? lList *acl_list - ???
interactive_cq_rejected() -- Check, if -now yes rejects cluster queue
static bool interactive_cq_rejected(const lListElem *cq)
Returns true if -now yes jobs can not be run in cluster queue
const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: interactive_cq_rejected() is MT safe
is_attr_prior() -- compares two attribute instances with each other
static bool is_attr_prior(lListElem *upper_el, lListElem *lower_el)
checks if the first given attribute instance has a higher priority than the second instance. If the first is NULL, it returns false; if the second, or both the second and first, is NULL, it returns true. If the "==" or "!=" operators are used, it is true; if both are the same, it may return false. Otherwise it computes the minimum or maximum between the values.
lListElem *upper_el - attribute, which should be overridden by the second one. lListElem *lower_el - attribute, which wants to override the first one.
static bool - true, when the first attribute has a higher priority.
is_requested() -- Returns true if specified resource is requested.
bool is_requested(lList *req, const char *attr)
Returns true if specified resource is requested. Both long name and shortcut name are checked.
lList *req - The request list (CE_Type) const char *attr - The resource name.
bool - true if requested, otherwise false
MT-NOTE: is_requested() is MT safe
load_locate_elem() -- locates a consumable category in the given load list
static lListElem* load_locate_elem(lList *load_list, lListElem *global_consumable, lListElem *host_consumable, lListElem *queue_consumable)
lList *load_list - the load list to work on lListElem *global_consumable - a ref to the global consumable lListElem *host_consumable - a ref to the host consumable lListElem *queue_consumable - a ref to the queue consumable
static lListElem* - NULL, or the category element from the load list
MT-NOTE: load_locate_elem() is MT safe
load_np_value_adjustment() -- adjusts np load values for the number of processors
static int load_np_value_adjustment(const char* name, lListElem *hep, double *load_correction)
Tests the load value name for "np_*". If this pattern is found, it will retrieve the number of processors and adjusts the load_correction accordingly. If the pattern is not found, it does nothing and returns 0 for number of processors.
const char* name - load value name lListElem *hep - host object double *load_correction - current load_correction for further corrections
static int - number of processors, or 0 if it was called on a non-np load value
MT-NOTE: load_np_value_adjustment() is MT safe
match_static_advance_reservation() -- Do matching that depends not on queue or host
static dispatch_t match_static_advance_reservation(const sge_assignment_t *a)
Checks whether a job that requests an advance reservation can be scheduled. The job can be scheduled if the advance reservation is in state "running".
const sge_assignment_t *a - assignment to match
static dispatch_t - DISPATCH_OK on success DISPATCH_NEVER_CAT on error
MT-NOTE: match_static_advance_reservation() is MT safe
parallel_assignment() -- Can we assign with a fixed PE/slot/time
int parallel_assignment(sge_assignment_t *assignment)
Returns if possible an assignment for a particular PE with a fixed slot at a fixed time.
sge_assignment_t *a - category_use_t *use_category - has information on how to use the job category
dispatch_t - 0 ok got an assignment 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: parallel_assignment() is not MT safe
parallel_available_slots() -- Check if number of PE slots is available
dispatch_t - 0 ok got an assignment 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category
MT-NOTE: parallel_available_slots() is not MT safe
parallel_host_slots() -- Return host slots available at time period
The maximum amount available at the host for the specified time period is determined.
parallel_tag_hosts_queues() -- Determine host slots and tag queue(s) accordingly
For a particular job the maximum number of slots that could be served at that host is determined in accordance with the allocation rule and returned. The time of the assignment can be either DISPATCH_TIME_NOW or a specific time, but never DISPATCH_TIME_QUEUE_END. In those cases when the allocation rule allows more than one slot to be served per host it is necessary to also consider possibly specified per-queue load thresholds. This is because load is a global/per-host concept while load thresholds are a queue attribute. In those cases when the allocation rule gives us neither a fixed amount of slots required nor an upper limit for the number of per-host slots (i.e. $fill_up and $round_robin) we must iterate through all slot numbers from 1 to the maximum number of slots "total_slots" and check with each slot amount whether we can get it or not. Iteration stops when we can't get more slots from the host based on the queue limitations and load thresholds. As long as only one single queue at the host is eligible for the job it is sufficient to check with each iteration whether the corresponding number of slots can be served by the host and its queue or not. The really sick case however is when multiple queues are eligible for a host: here we have to determine in each iteration step also the maximum number of slots each queue could get us by doing a per-queue iteration from 1 up to the maximum number of slots we're testing. The optimization in effect here is to always check only if we could get more slots than with the former per-host slot amount iteration.
sge_assignment_t *a - lListElem *hep - current host lListElem *global - global host int *slots - out: # free slots int *slots_qend - out: # free slots in the far far future int global_soft_violations - # of global soft violations bool *master_host - out: if true, found a master host category_use_t *use_category - int/out : how to use the job category
static dispatch_t - 0 ok got an assignment 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: parallel_tag_hosts_queues() is not MT safe
parallel_tag_queues_suitable4job() -- Tag queues/hosts for a comprehensive/parallel assignment
static int parallel_tag_queues_suitable4job(sge_assignment_t *assignment)
We tag the number of available slots for that job at global, host and queue level under consideration of all constraints of the job. We also mark those queues that are suitable as a master queue as possible master queues and count the number of violations of the job's soft request. The method below is named comprehensive since it does the tagging game for the whole parallel job and under consideration of all available resources that could help to satisfy the job's request. This is necessary to prevent consumable resource limitation at host/global level multiple times. While tagging we also set queues QU_host_seq_no based on the sort order of each host. Assumption is the host list passed is sorted according to the load formula.
sge_assignment_t *assignment - ??? category_use_t use_category - information on how to use the job category
static dispatch_t - 0 ok got an assignment 1 no assignment at the specified time -2 assignment will never be possible for that particular job
MT-NOTE: parallel_tag_queues_suitable4job() is not MT safe
pe_cq_rejected() -- Check, if -pe pe_name rejects cluster queue
static bool pe_cq_rejected(const char *pe_name, const lListElem *cq)
Match a job's -pe 'pe_name' with the pe_list cluster queue configuration. True is returned if the parallel environment has no access.
const char *pe_name - the pe request of a job (no wildcard) const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: pe_cq_rejected() is MT safe
project_cq_rejected() -- Check, if -P project rejects cluster queue
static bool project_cq_rejected(const char *project, const lListElem *cq)
Match a job's -P 'project' with the project/xproject cluster queue configuration. True is returned if the project has no access.
const char *project - the project of a job or NULL const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: project_cq_rejected() is MT safe
rc_time_by_slots() -- checks whether all resource requests on one level are fulfilled
static int rc_time_by_slots(lList *requested, lList *load_attr, lList *config_attr, lList *actual_attr, lList *centry_list, lListElem *queue, bool allow_non_requestable, char *reason, int reason_size, int slots, u_long32 layer, double lc_factor, u_long32 tag)
Checks whether all requests, default requests and implicit requests on this level are fulfilled. With reservation scheduling the earliest start time due to resources of the resource container is the maximum of the earliest start times for all resources comprised by the resource container that are requested by the job.
lList *requested - list of attribute requests lList *load_attr - list of load attributes or null on queue level lList *config_attr - list of user defined attributes lList *actual_attr - usage of all consumables (RUE_Type) lList *centry_list - system wide attribute config. list (CE_Type) lListElem *queue - current queue or NULL on global/host level bool allow_non_requestable - allow non-requestables? char *reason - error message int reason_size - max error message size int slots - number of slots the job is looking for u_long32 layer - current layer flag double lc_factor - load correction factor u_long32 tag - current layer tag u_long32 *start_time - in/out argument for start time u_long32 duration - job's estimated total run time const char *object_name - name of the object used for monitoring purposes
dispatch_t -
MT-NOTES: is not thread safe; uses a static buffer. Important: we have some special behavior when slots is set to -1.
ri_slots_by_time() -- Determine number of slots avail. within time frame
static dispatch_t ri_slots_by_time(const sge_assignment_t *a, int *slots, int *slots_qend, lList *rue_list, lListElem *request, lList *load_attr, lList *total_list, lListElem *queue, u_long32 layer, double lc_factor, dstring *reason, bool allow_non_requestable, bool no_centry, const char *object_name)
The number of slots available with a resource can be zero for static resources or is determined based on maximum utilization within the specific time frame, the total amount of the resource and the per task request of the parallel job (ri_slots_by_time())
const sge_assignment_t *a - ??? int *slots - Returns maximum slots that can be served within the specified time frame. int *slots_qend - Returns the maximum possible number of slots lList *rue_list - Resource utilization (RUE_Type) lListElem *request - Job request (CE_Type) lList *load_attr - Load information for the resource lList *total_list - Total resource amount (CE_Type) lListElem *queue - Queue instance (QU_Type) for queue-based resources u_long32 layer - DOMINANT_LAYER_{GLOBAL|HOST|QUEUE} double lc_factor - load correction factor dstring *reason - diagnosis information if no rsrc available bool allow_non_requestable - ??? bool no_centry - ??? const char *object_name - ???
static dispatch_t -
MT-NOTE: ri_slots_by_time() is not MT safe
ri_time_by_slots() -- Determine availability time through slot number
int ri_time_by_slots(lListElem *rep, lList *load_attr, lList *config_attr, lList *actual_attr, lList *centry_list, lListElem *queue, char *reason, int reason_size, bool allow_non_requestable, int slots, u_long32 layer, double lc_factor)
Checks for one level, if one request is fulfilled or not. With reservation scheduling the earliest start time due to availability of the resource instance is determined by ensuring non-consumable resource requests are fulfilled or by finding the earliest time utilization of a consumable resource is below the threshold required for the request.
sge_assignment_t *a - assignment object that holds job specific scheduling relevant data lListElem *rep - requested attribute lList *load_attr - list of load attributes or null on queue level lList *config_attr - list of user defined attributes (CE_Type) lList *actual_attr - usage of user consumables (RUE_Type) lListElem *queue - the current queue, or null on host level dstring *reason - target for error message bool allow_non_requestable - allow none requestable attributes? int slots - the number of slots the job is looking for? u_long32 layer - the current layer double lc_factor - load correction factor u_long32 *start_time - in/out argument for start time const char *object_name - name of the object used for monitoring purposes
dispatch_t -
sequential_tag_queues_suitable4job() -- ???
The start time of a queue is always returned using the QU_available_at field. The overall behaviour of this function is somewhat dependent on the value that gets passed to assignment->start and whether soft requests were specified with the job: (1) In case of now assignments (DISPATCH_TIME_NOW) only the first queue suitable for jobs without soft requests is tagged. When soft requests are specified all queues must be verified and tagged in order to find the queue that fits best. (2) In case of reservation assignments (DISPATCH_TIME_QUEUE_END) the earliest time is searched when the resources of global/host/queue are sufficient for the job. The time-wise iteration is then done for each single resource instance. Actually there are cases when iterating through all queues is not needed: (a) if there is a global limitation the search could stop once a queue is found that causes no further delay (b) if the job has a soft request the search could stop once a queue is found with minimum (=0) soft violations.
sge_assignment_t *assignment - job info structure
dispatch_t - 0 ok got an assignment start time(s) and slots are tagged 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: sequential_tag_queues_suitable4job() is not MT safe
sge_call_pe_qsort() -- call the Parallel Environment qsort plug-in
void sge_call_pe_qsort(sge_assignment_t *a, const char *qsort_args)
sge_assignment_t *a - PE assignment qsort_args - the PE qsort_args attribute
MT-NOTE: sge_call_pe_qsort() is not MT safe
sge_create_load_list() -- create the control structure for consumables as load thresholds
void sge_create_load_list(const lList *queue_list, const lList *host_list, const lList *centry_list, lList **load_list)
scans all queues for consumables as load thresholds. It builds a consumable category for each queue which is using consumables as a load threshold. If no consumables are used, the *load_list is set to NULL.
const lList *queue_list - a list of queue instances const lList *host_list - a list of hosts const lList *centry_list - a list of complex entries lList **load_list - a ref to the target load list
MT-NOTE: sge_create_load_list() is MT safe
sge_free_load_list() -- frees the load list and sets it to NULL
void sge_free_load_list(lList **load_list)
lList **load_list - the load list
MT-NOTE: sge_free_load_list() is MT safe
sge_host_match_static() -- Static test whether job fits to host
static int sge_host_match_static(lListElem *job, lListElem *ja_task, lListElem *host, lList *centry_list, lList *acl_list)
lListElem *job - ??? lListElem *ja_task - ??? lListElem *host - ??? lList *centry_list - ??? lList *acl_list - ???
int - 0 ok -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
sge_load_list_alarm() -- checks if queues went into an alarm state
bool sge_load_list_alarm(lList *load_list, const lList *host_list, const lList *centry_list)
The function uses the cull bitfield to identify modifications in one of the consumable elements. If the consumption has changed, the load for all queues referencing the consumable is recomputed. If a queue exceeds its load threshold, QU_tagged4schedule is set to 1.
lList *load_list - ??? const lList *host_list - ??? const lList *centry_list - ???
bool - true, if at least one queue was set into alarm state
MT-NOTE: sge_load_list_alarm() is MT safe
sge_queue_match_static() -- Do matching that depends not on time.
static int sge_queue_match_static(lListElem *queue, lListElem *job, const lListElem *pe, const lListElem *ckpt, lList *centry_list, lList *host_list, lList *acl_list)
Checks if a job fits on a queue or not. All checks that depend on the current load and resource situation must get handled outside. The queue also gets tagged in QU_tagged4schedule to indicate whether it is specified using -masterq queue_list.
lListElem *queue - The queue we're matching lListElem *job - The job const lListElem *pe - The PE object const lListElem *ckpt - The ckpt object lList *centry_list - The centry list lList *acl_list - The ACL list
dispatch_t - DISPATCH_OK, ok DISPATCH_NEVER_CAT, assignment will never be possible for all jobs of that category
sge_remove_queue_from_load_list() -- removes queues from the load list
void sge_remove_queue_from_load_list(lList **load_list, const lList *queue_list)
lList **load_list - load list structure const lList *queue_list - queues to be removed from it.
MT-NOTE: sge_remove_queue_from_load_list() is MT safe
sge_select_queue() -- checks whether a job matches a given queue or host
int sge_select_queue(lList *requested_attr, lListElem *queue, lListElem *host, lList *exechost_list, lList *centry_list, bool allow_non_requestable, int slots)
Takes the requested attributes from a job and checks if they match the given host or queue. One and only one should be specified. If both are given, the function assumes that the queue belongs to the given host.
lList *requested_attr - list of requested attributes lListElem *queue - current queue or null if host is set lListElem *host - current host or null if queue is set lList *exechost_list - list of all hosts in the system lList *centry_list - system wide attribute config list bool allow_non_requestable - allow non requestable? int slots - number of requested slots lList *queue_user_list - list of users or null lList *acl_list - acl_list or null lListElem *job - job or null
int - 1, if okay, QU_tag will be set if a queue is selected 0, if not okay
The caller is responsible for cleaning tags. No range is used. For serial jobs we will need one call for hard and one for soft requests. For parallel jobs we will call this function for each -l request. This is because in serial jobs requests can simply be added. In parallel jobs each -l requests a different set of queues.
sge_sequential_assignment() -- Make an assignment for a sequential job.
int sge_sequential_assignment(sge_assignment_t *assignment)
For sequential job assignments the earliest job start time is determined for each queue instance and the earliest one gets chosen. The secondary criterion for queue selection is minimizing the job's soft requests. The overall behaviour of this function is somewhat dependent on the value that gets passed to assignment->start and whether soft requests were specified with the job: (1) In case of now assignments (DISPATCH_TIME_NOW) only the first queue suitable for jobs without soft requests is tagged. When soft requests are specified all queues must be verified and tagged in order to find the queue that fits best. On success the start time is set. (2) In case of queue end assignments (DISPATCH_TIME_QUEUE_END)
sge_assignment_t *assignment - ???
int - 0 ok got an assignment + time (DISPATCH_TIME_NOW and DISPATCH_TIME_QUEUE_END) 1 no assignment at the specified time -1 assignment will never be possible for all jobs of that category -2 assignment will never be possible for that particular job
MT-NOTE: sge_sequential_assignment() is not MT safe
sge_split_queue_slots_free() -- ???
int sge_split_queue_slots_free(lList **free, lList **full)
Split queue list into queues with at least one free slot and queues with less than one free slot. The list optionally returned in full gets the QNOSLOTS queue instance state set.
lList **free - Input queue instance list and return free slots. lList **full - If non-NULL the full queue instances get returned here.
int - 0 success -1 error
print_hdr() -- print a header for the sharetree dump
void print_hdr(dstring *out, const format_t *format)
Prints a header for data output using the sge_sharetree_print function.
dstring *out - dstring into which data will be written const format_t *format - format description
MT-NOTE: print_hdr() is MT-safe
sge_sharetree_print() -- dump sharetree information to a dstring
void sge_sharetree_print(dstring *out, lList *sharetree, lList *users, lList *projects, lList *config, bool group_nodes, bool decay_usage, const char **names, const format_t *format)
Dumps information about a sharetree into a given dstring. Information is appended. Outputs information like times, node (user/project) names, configured shares, actually received shares, targeted shares, usage information like cpu, memory and io. It is possible to restrict the number of fields that are output. Header information and formatting can be configured.
dstring *out - dstring into which data will be written lList *sharetree - the sharetree to dump lList *users - the user list lList *projects - the project list lList *config - the scheduler configuration list bool group_nodes - ??? bool decay_usage - ??? const char **names - fields to output const format_t *format - format description
MT-NOTE: sge_sharetree_print() is MT-safe
sge_do_urgency() -- Compute normalized urgency
void sge_do_urgency(u_long32 now, lList *running_jobs, lList *pending_jobs, sge_Sdescr_t *lists)
Determine normalized urgency for all job lists passed: * for the pending jobs we need it to determine the dispatch order * for the running jobs it is needed when the running jobs' priority must be compared with pending jobs (preemption only)
u_long32 now - Current time lList *running_jobs - The running jobs list lList *pending_jobs - The pending jobs list sge_Sdescr_t *lists - Additional config information
sge_normalize_urgency() -- Computes normalized urgency for job list
static void sge_normalize_urgency(lList *job_list, double min_urgency, double max_urgency)
The normalized urgency is determined for a list of jobs based on the min/max urgency values passed and the JB_urg value of each job.
lList *job_list - The job list double min_urgency - minimum urgency value double max_urgency - maximum urgency value
MT-NOTES: sge_normalize_urgency() is MT safe
sge_normalize_value() -- Returns normalized value with passed value range
double sge_normalize_value(double value, double range_min, double range_max)
The value passed is normalized and the resulting value (0.0-1.0) is returned. The value range passed is assumed. In case there is no range because min/max are (nearly) equal 0.5 is returned.
double value - Value to be normalized. double range_min - Range minimum value. double range_max - Range maximum value.
double - Normalized value (0.0-1.0)
MT-NOTE: sge_normalize_value() is MT safe
sge_urgency() -- Determine urgency value for a list of jobs
static void sge_urgency(u_long32 now, double *min_urgency, double *max_urgency, lList *job_list, const lList *centry_list, const lList *pe_list)
The urgency value is determined for all jobs in job_list. The urgency value has two time dependent components (waiting time contribution and deadline contribution) and a resource request dependent component. Only resource requests that apply to the job irrespective what resources it gets assigned finally are considered. Default requests specified for consumable resources are not considered as they are placement dependent. For the same reason soft request do not contribute to the urgency value. The urgency value range is tracked via min/max urgency. Category-based caching is used for the resource request urgency contribution.
u_long32 now - Current time double *min_urgency - For tracking minimum urgency value double *max_urgency - For tracking maximum urgency value lList *job_list - The jobs. const lList *centry_list - Needed for per resource urgency setting. const lList *pe_list - Needed to determine urgency slot setting.
build_functional_categories() -- sorts the pending jobs into functional categories
void build_functional_categories(sge_ref_t *job_ref, int num_jobs, sge_fcategory_t **root, int dependent)
Generates a list of functional categories. Each category contains a list of jobs which belongs to this category. A functional category is assembled from: - job shares - user shares - department shares - project shares All jobs with the same job, user,... shares are put in the same fcategory.
sge_ref_t *job_ref - array of pointers to the job reference structure int num_jobs - number of elements in the job_ref array sge_fcategory_t **root - root pointer to the functional category list sge_ref_list_t ** ref_array - has to be a pointer to NULL pointer. The memory will be allocated in this function and freed with free_fcategories. int dependent - do the functional tickets depend on previously computed tickets? u_long32 job_tickets - job field, which has the tickets (JB_jobshare, JB_override_tickets) u_long32 up_tickets - source for the user/department tickets/shares (UP_fshare, UP_otickets) u_long32 dp_tickets - source for the department tickets/shares (US_fshare, US_oticket)
u_long32 - number of jobs in the categories
- job classes are ignored. IMPROVEMENTS: - the stored values in the functional category structure can be used to speed up the ticket calculation. This will avoid unnecessary CULL accesses in the function calc_job_functional_tickets_pass1 - A further improvement can be done by: - limiting the job list length in each category to the max nr of jobs calculated - Sorting the jobs in each functional category by its job category. Each resulting job list can be of max size of open slots. This will result in a correct ftix result for all jobs, which might be scheduled.
???
calc_intern_pending_job_functional_tickets() -- calc ftix for pending jobs
void calc_intern_pending_job_functional_tickets(sge_fcategory_t *current, double sum_of_user_functional_shares, double sum_of_project_functional_shares, double sum_of_department_functional_shares, double sum_of_job_functional_shares, double total_functional_tickets, double weight[])
This is an optimized and incomplete version of calc_pending_job_functional_tickets. It is good enough to get the order right within the inner loop of the ftix calculation.
sge_fcategory_t *current - current fcategory double sum_of_user_functional_shares double sum_of_project_functional_shares double sum_of_department_functional_shares double sum_of_job_functional_shares double total_functional_tickets double weight[] - distribution of the shares to each other
be careful using it
???
calculate_pending_shared_override_tickets() -- calculate shared override tickets
static void calculate_pending_shared_override_tickets(sge_ref_t *job_ref, int num_jobs, int dependent)
We calculate the override tickets for pending jobs, which are shared. The basic algorithm looks like this: do for each pending job do for each pending job which isn't yet considered active consider the job active calculate override tickets for that job consider the job not active end do consider the job with the highest priority (taking into account all previous policies + override tickets) as active end do set all pending jobs not active Since this algorithm is very expensive, we split all pending jobs into fcategories. The algorithm changes to: max_jobs = build fcategories and ignore jobs, which would get 0 override tickets do for max_jobs pending job do for each fcategory take the first job from category consider the job active calculate override tickets for that job consider the job not active store job with the most override tickets = job_max end do set job_max active and remove it from its fcategory. remove job_max's fcategory, if job_max was the last job end; set all pending jobs not active That's it. It is very similar to the functional ticket calculation, except that we are working with tickets and not with shares.
sge_ref_t *job_ref - an array of job structures (first running, then pending) int num_jobs - number of jobs in the array int dependent - do other ticket policies depend on this one?
MT-NOTE: calculate_pending_shared_override_tickets() is MT safe
copy_ftickets() -- copy the ftix from one job to an other one
void copy_ftickets(sge_ref_list_t *source, sge_ref_list_t *dest)
Copy the functional tickets and ref fields used for ftix calculation from one job to an other job.
sge_ref_list_t *source - source job sge_ref_list_t *dest - dest job
???
destribute_ftickets() -- ensures that all jobs have ftix associated with them.
void destribute_ftickets(sge_fcategory_t *root, int dependent)
After the functional tickets are calculated, only the first job in the fcategory job list has ftix. This function copies the result from the first job to all other jobs in the same list and sums the job ticket count with the ftix.
sge_fcategory_t *root - fcategory list int dependent - does the final ticket count depend on ftix?
- This function is only needed, because not all functional tickets are calculated and to give a best guess result, all jobs in one category with no ftix get the same amount of ftix.
free_fcategories() -- frees all fcategories and their job lists.
void free_fcategories(sge_fcategory_t **fcategories)
frees all fcategories and their job lists.
sge_fcategory_t **fcategories - pointer to a pointer of the first fcategory sge_ref_list_t **ref_array - memory for internal structures, allocated with build_functional_categories. Needs to be freed as well.
- it does not delete the sge_ref_t structures, which are stored in the job lists.
recompute_prio() -- Recompute JAT prio based on changed ticket amount
static void recompute_prio(sge_task_ref_t *tref, lListElem *task, double nurg)
Each time the ticket amount in a JAT_Type element is changed the JAT_prio needs to be updated. The new ticket value is normalized and the priority value is computed.
sge_task_ref_t *tref - The tref element that is related to the ticket change lListElem *task - The JAT_Type task element. double nurg - The normalized urgency assumed for the job. double npri - The normalized POSIX priority assumed for the job.
sge_build_sgeee_orders() -- build orders for updating qmaster
void sge_build_sgeee_orders(sge_Sdescr_t *lists, lList *running_jobs, lList *queued_jobs, lList *finished_jobs, order_t *orders, int update_usage_and_configuration, int seqno)
Generates the order list for sending the scheduling decisions to the qmaster. The following orders are generated: - running job tickets - pending job tickets - delete order for finished jobs - update user usage order - update project usage order - update share tree order - update scheduler configuration order - orders updating user/project resource usage (ORT_update_project_usage) - orders updating running tickets needed for dynamic reprioritization (ORT_ticket) Most orders are generated by using the sge_create_orders function.
sge_Sdescr_t *lists - ??? lList *running_jobs - list of running jobs lList *queued_jobs - list of queued jobs (should be sorted by tickets) lList *finished_jobs - list of finished jobs order_t *orders - existing order list (new orders will be added to it) bool update_usage_and_configuration - if true, the update usage orders are generated int seqno - a seqno, changed with each scheduling run bool max_queued_ticket_orders - if true, pending tickets are submitted to the qmaster bool updated_execd - if true, the queue information is sent with the running job tickets
void
sge_do_sgeee_priority() -- determine GEEE priority for a list of jobs
static void sge_do_sgeee_priority(lList *job_list, double min_tix, double max_tix)
Determines for a list of jobs the GEEE priority. Before sge_do_sgeee_priority() can be called the normalized urgency value must already be known for each job. The ticket range passed is used for normalizing the ticket amount.
lList *job_list - The job list double min_tix - Minimum ticket amount double max_tix - Maximum ticket amount bool do_nprio - Needs norm. priority be determined bool do_nurg - Needs norm. urgency be determined
MT-NOTE: sge_do_sgeee_priority() is MT safe
sgeee_priority() -- Compute final GE priority
static void sgeee_priority(lListElem *task, u_long32 jobid, double nsu, double min_tix, double max_tix)
The GE priority is computed for the task based on the already known ticket amount and already normalized urgency value. The ticket amount is normalized based on the ticket range passed. The weights for ticket and urgency value are applied.
lListElem *task - The task whose priority is computed u_long32 jobid - The jobs id double nsu - The normalized urgency value that applies to all tasks of the job. double min_tix - minimum ticket amount double max_tix - maximum ticket amount
MT-NOTE: sgeee_priority() is MT safe
sgeee_resort_pending_jobs() -- Resort pending jobs after assignment
void sgeee_resort_pending_jobs(lList **job_list, lList *orderlist)
Update pending jobs order upon assignment and change ticket amounts in orders previously created. If we dispatch a job sub-task and the job has more sub-tasks, then the job is still first in the job list. We need to remove and reinsert the job back into the sorted job list in case another job is higher priority (i.e. has more tickets) Additionally it is necessary to update the number of pending tickets for the following pending array task. (The next task will get less tickets than the current one)
lList **job_list - The pending job list. The first job in the list was assigned right before.
sgeee_scheduler() -- calc tickets, send orders, and sort job list
int sgeee_scheduler(sge_Sdescr_t *lists, lList *running_jobs, lList *finished_jobs, lList *pending_jobs, lList **orderlist)
- calculates the running and pending job tickets. - send the orders to the qmaster about the job tickets - order the pending job list according to the job tickets On a "normal" scheduling interval: - calculate tickets for new and running jobs - don't decay and sum usage - don't update qmaster On a scheduling interval: - calculate tickets for new and running jobs - decay and sum usage - handle finished jobs - update qmaster
sge_Sdescr_t *lists - a ref to all lists in this scheduler lList *running_jobs - a list of all running jobs lList *finished_jobs - a list of all finished jobs lList *pending_jobs - a list of all pending jobs lList **orderlist - the order list
int - 0 if everything went fine, -1 if not
tix_range_get() -- Get stored ticket range.
static void tix_range_get(double *min_tix, double *max_tix)
Get stored ticket range from global variables.
double *min_tix - Target for minimum value. double *max_tix - Target for maximum value.
MT-NOTES: tix_range_get() is not MT safe
tix_range_set() -- Store ticket range.
static void tix_range_set(double min_tix, double max_tix)
Stores ticket range in the global variables.
double min_tix - Minimum ticket value. double max_tix - Maximum ticket value.
MT-NOTES: tix_range_set() is not MT safe
sge_ar_queue_have_users_access() -- verify that all users of an AR have queue access
bool sge_ar_queue_have_users_access(lList **alpp, lListElem *ar, lListElem *queue, lList *master_userset_list)
Iterates over the AR_acl_list and checks that every entry has queue access. If even one has no access the function returns false
lList **alpp - answer list lListElem *ar - advance reservation object (AR_Type) lListElem *queue - queue instance object (QU_Type) lList *master_userset_list - master userset list
bool - true if all have access, false if at least one has no access
MT-NOTE: sge_ar_queue_have_users_access() is MT safe
--Simple-Scheduler-Interface
: schedlib ssi --Simple-Scheduler-Interface-SERF_Implementation
: SERF -SERF_Implementation-SERF_Interface
: SERF -SERF_Interface-Simple-Scheduler-Interface-Typedefs
: schedlib ssi -Simple-Scheduler-Interface-Typedefsaccess_cq_rejected
: sge_select_queue access_cq_rejectedadd_calendar_to_schedule
: sge_resource_utilization add_calendar_to_scheduleadd_job_utilization
: sge_resource_utilization add_job_utilizationadd_pe_slots_to_category
: sge_select_queue add_pe_slots_to_categorybuild_functional_categories
: sgeee build_functional_categoriesbuild_name_filter
: sge_complex_schedd build_name_filtercalc_intern_pending_job_functional_tickets
: sgeee calc_intern_pending_job_functional_ticketscalculate_pending_shared_override_tickets
: sgeee calculate_pending_shared_override_ticketscheck_and_debit_rqs_slots
: sge_resource_quota_schedd check_and_debit_rqs_slotsclean_up_parallel_job
: sge_select_queue clean_up_parallel_jobclear_resource_tags
: sge_select_queue clear_resource_tagscompute_soft_violations
: sge_select_queue compute_soft_violationscopy_ftickets
: sgeee copy_fticketscqueue_match_static
: sge_select_queue cqueue_match_staticcqueue_shadowed
: sge_resource_quota_schedd cqueue_shadowedcqueue_shadowed_by
: sge_resource_quota_schedd cqueue_shadowed_bydebit_job_from_rqs
: sge_resource_quota_schedd debit_job_from_rqsdestribute_ftickets
: sgeee destribute_fticketsfill_category_use_t
: sge_select_queue fill_category_use_tfree_fcategories
: sgeee free_fcategoriesget_attribute
: sge_select_queue get_attributeget_attribute_by_Name
: sge_select_queue get_attribute_by_Nameget_attribute_list
: sge_complex_schedd get_attribute_listget_attribute_list_by_names
: sge_complex_schedd get_attribute_list_by_namesget_name_of_split_value
: sched sge_job_schedd get_name_of_split_valueget_queue_resource
: sge_select_queue get_queue_resourcehost_shadowed
: sge_resource_quota_schedd host_shadowedhost_shadowed_by
: sge_resource_quota_schedd host_shadowed_byhost_time_by_slots
: sge_select_queue host_time_by_slotsinteractive_cq_rejected
: sge_select_queue interactive_cq_rejectedis_attr_prior
: sge_select_queue is_attr_prioris_attr_prior2
: sge_complex_schedd is_attr_prior2is_cqueue_expand
: sge_resource_quota_schedd is_cqueue_expandis_cqueue_global
: sge_resource_quota_schedd is_cqueue_globalis_host_expand
: sge_resource_quota_schedd is_host_expandis_host_global
: sge_resource_quota_schedd is_host_globalis_requested
: sge_select_queue is_requestedjob_get_duration
: sched sge_job_schedd job_get_durationjob_lists_split_with_reference_to_max_running
: sched sge_job_schedd job_lists_split_with_reference_to_max_runningjob_move_first_pending_to_running
: sched sge_job_schedd job_move_first_pending_to_runningload_locate_elem
: sge_select_queue load_locate_elemload_np_value_adjustment
: sge_select_queue load_np_value_adjustmentmatch_static_advance_reservation
: sge_select_queue match_static_advance_reservationnewResourceElem
: sge_resource_utilization newResourceElemorder_remove_immediate
: SCHEDD order_remove_immediateorder_remove_order_and_immediate
: SCHEDD order_remove_order_and_immediateparallel_assignment
: sge_select_queue parallel_assignmentparallel_available_slots
: sge_select_queue parallel_available_slotsparallel_global_slots
: sched select_queue parallel_global_slotsparallel_host_slots
: sge_select_queue parallel_host_slotsparallel_limit_slots_by_time
: sge_resource_quota_schedd parallel_limit_slots_by_timeparallel_maximize_slots_pe
: scheduler parallel_maximize_slots_peparallel_queue_slots
: sched select_queue parallel_queue_slotsparallel_reservation_max_time_slots
: scheduler parallel_reservation_max_time_slotsparallel_rqs_slots_by_time
: sge_resource_quota_schedd parallel_rqs_slots_by_timeparallel_tag_hosts_queues
: sge_select_queue parallel_tag_hosts_queuesparallel_tag_queues_suitable4job
: sge_select_queue parallel_tag_queues_suitable4jobpe_cq_rejected
: sge_select_queue pe_cq_rejectedpe_match_static
: sge_pe_schedd pe_match_staticprepare_resource_schedules
: sge_resource_utilization prepare_resource_schedulesprint_hdr
: sge_sharetree_printing print_hdrproject_cq_rejected
: sge_select_queue project_cq_rejectedrc_time_by_slots
: sge_select_queue rc_time_by_slotsrecompute_prio
: sgeee recompute_prioremove_immediate_job
: SCHEDD remove_immediate_jobremove_immediate_jobs
: SCHEDD remove_immediate_jobsrequest_cq_rejected
: sge_complex_schedd request_cq_rejectedri_slots_by_time
: sge_select_queue ri_slots_by_timeri_time_by_slots
: sge_select_queue ri_time_by_slotsrqs_add_job_utilization
: sge_resource_utilization rqs_add_job_utilizationrqs_by_slots
: sge_resource_quota_schedd rqs_by_slotsrqs_can_optimize
: sge_resource_quota_schedd rqs_can_optimizerqs_exceeded_sort_out
: sge_resource_quota_schedd rqs_exceeded_sort_outrqs_exceeded_sort_out_par
: sge_resource_quota_schedd rqs_exceeded_sort_out_parrqs_excluded_cqueues
: sge_resource_quota_schedd rqs_excluded_cqueuesrqs_excluded_hosts
: sge_resource_quota_schedd rqs_excluded_hostsrqs_expand_cqueues
: sge_resource_quota_schedd rqs_expand_cqueuesrqs_expand_hosts
: sge_resource_quota_schedd rqs_expand_hostsrqs_limitation_reached
: sge_resource_quota_schedd rqs_limitation_reachedrqs_match_assignment
: sge_resource_quota_schedd rqs_match_assignmentrqs_set_dynamical_limit
: sge_resource_quota_schedd rqs_set_dynamical_limitschedd_mes_add
: schedd schedd_mes schedd_mes_addschedd_mes_add_global
: schedd schedd_mes schedd_mes_add_globalschedd_mes_add_join
: schedd_message schedd_mes_add_joinschedd_mes_commit
: schedd schedd_mes schedd_mes_commitschedd_mes_get_tmp_list
: schedd_message schedd_mes_get_tmp_listschedd_mes_initialize
: schedd schedd_mes schedd_mes_initializeschedd_mes_obtain_package
: schedd schedd_mes schedd_mes_obtain_packageschedd_mes_rollback
: schedd schedd_mes schedd_mes_rollbackschedd_mes_set_tmp_list
: schedd_message schedd_mes_set_tmp_listsequential_global_time
: sched select_queue sequential_global_timesequential_queue_time
: sched select_queue sequential_queue_timesequential_tag_queues_suitable4job
: sge_select_queue sequential_tag_queues_suitable4jobserf_exit
: sge_resource_utilization serf_exitserf_init
: sge_resource_utilization serf_initserf_new_interval
: sge_resource_utilization serf_new_intervalserf_record_entry
: sge_resource_utilization serf_record_entryset_utilization
: sge_resource_utilization set_utilizationsge_add_schedd_info
: sge_orders sge_add_schedd_infosge_ar_queue_have_users_access
: valid_queue_user sge_ar_queue_have_users_accesssge_build_sgeee_orders
: sgeee sge_build_sgeee_orderssge_call_pe_qsort
: sge_select_queue sge_call_pe_qsortsge_create_load_list
: sge_select_queue sge_create_load_listsge_create_orders
: sge_orders sge_create_orderssge_dlib
: sge_dlibsge_do_sgeee_priority
: sgeee sge_do_sgeee_prioritysge_do_urgency
: sge_urgency sge_do_urgencysge_free_load_list
: sge_select_queue sge_free_load_listsge_get_schedd_text
: sge_schedd_text sge_get_schedd_textsge_GetNumberOfOrders
: sge_orders sge_GetNumberOfOrderssge_host_match_static
: sge_select_queue sge_host_match_staticsge_job_slot_request
: sge_job_schedd sge_job_slot_requestsge_join_orders
: sge_orders sge_join_orderssge_load_list_alarm
: sge_select_queue sge_load_list_alarmsge_normalize_urgency
: sge_urgency sge_normalize_urgencysge_normalize_value
: sge_urgency sge_normalize_valuesge_qeti_first
: sge_resource_utilization sge_qeti_firstsge_qeti_list_add
: sge_qeti sge_qeti_list_addsge_qeti_next
: sge_resource_utilization sge_qeti_nextsge_qeti_next_before
: sge_qeti sge_qeti_next_beforesge_qeti_release
: sge_resource_utilization sge_qeti_releasesge_queue_match_static
: sge_select_queue sge_queue_match_staticsge_remove_queue_from_load_list
: sge_select_queue sge_remove_queue_from_load_listsge_select_parallel_environment
: scheduler sge_select_parallel_environmentsge_select_queue
: sge_select_queue sge_select_queuesge_sequential_assignment
: sge_select_queue sge_sequential_assignmentsge_sharetree_print
: sge_sharetree_printing sge_sharetree_printsge_split_queue_slots_free
: sge_select_queue sge_split_queue_slots_freesge_ssi_job_cancel
: schedlib ssi sge_ssi_job_cancelsge_ssi_job_start
: schedlib ssi sge_ssi_job_startsge_urgency
: sge_urgency sge_urgencysge_user_is_referenced_in_rqs
: sge_resource_quota_schedd sge_user_is_referenced_in_rqssgeee_priority
: sgeee sgeee_prioritysgeee_resort_pending_jobs
: sgeee sgeee_resort_pending_jobssgeee_scheduler
: sgeee sgeee_schedulerSPLIT_-Constants
: sched sge_job_schedd SPLIT_-Constantssplit_jobs
: sched sge_job_schedd split_jobstask_get_duration
: sge_job_schedd task_get_durationtix_range_get
: sgeee tix_range_gettix_range_set
: sgeee tix_range_settrash_splitted_jobs
: sched sge_job_schedd trash_splitted_jobsuser_list_init_jc
: sched sge_job_schedd user_list_init_jcutilization_add
: sge_resource_utilization utilization_addutilization_below
: sge_resource_utilization utilization_belowutilization_max
: sge_resource_utilization utilization_maxutilization_print_to_dstring
: sge_resource_utilization utilization_print_to_dstringutilization_queue_end
: sge_resource_utilization utilization_queue_end