order_remove_immediate() -- add a remove order for the job task
int order_remove_immediate(lListElem *job, lListElem *ja_task, order_t *orders)
Generates an order of type ORT_remove_immediate_job for the given job
task.
lListElem *job - The job to remove (JB_Type)
lListElem *ja_task - The task to remove (JAT_Type)
order_t *orders - The order structure; it will be extended by one delete order
int - Error code: 0 = OK, 1 = Errors
MT-NOTE: order_remove_immediate() is MT safe
order_remove_order_and_immediate() -- add a remove order for the job task
int order_remove_order_and_immediate(lListElem *job, lListElem *ja_task, order_t *orders)
Generates an order of type ORT_remove_immediate_job for the given job
task. Also removes the ORT_start_job order for this task from the order
list.
lListElem *job - The job to remove (JB_Type)
lListElem *ja_task - The task to remove (JAT_Type)
order_t *orders - The order structure for this scheduler pass; the start order will be removed from it
int - Error code: 0 = OK, 1 = Errors
MT-NOTE: order_remove_order_and_immediate() is MT safe
remove_immediate_job() -- test for and remove immediate job which can't
be scheduled
int remove_immediate_job(lList *job_list, lListElem *job, order_t *orders, int remove_orders)
Removes immediate jobs which cannot be scheduled from the given job list.
This is done by generating an order of type ORT_remove_immediate_job. If
remove_orders is set, the ORT_start_job orders are first removed from the
order list before adding the remove order.
lList *job_list - The list of jobs from which the job should be
removed (JB_Type)
lListElem *job - The job to remove (JB_Type)
order_t *orders - The order structure for this scheduler pass
int remove_orders - Whether the ORT_start_job orders should also
be removed
MT-NOTE: remove_immediate_job() is MT safe
remove_immediate_jobs() -- test for and remove immediate jobs which can't
be scheduled
int remove_immediate_jobs(lList *pending_job_list, lList *running_job_list, order_t *orders)
Goes through all jobs in the pending list to see if any are immediate and
not idle. If any are, they are removed. This is done by generating an
order of type ORT_remove_immediate_job. If any array jobs are removed,
the running list is checked for tasks belonging to the job, which are
also removed. This is done by removing the ORT_start_job orders and
adding an order of type ORT_remove_immediate_job.
lList *pending_job_list - The list of pending jobs for this scheduler
pass (JB_Type)
lList *running_job_list - The list of running jobs for this scheduler
pass (JB_Type)
order_t *orders - The order structure for this scheduler pass
int - Error code: 0 = OK, 1 = Errors (currently always returns 0)
MT-NOTE: remove_immediate_jobs() is MT safe
SERF_Implementation -- Functions that implement a generic schedule
entry recording facility (SERF)
SERF -- Schedule entry recording facility
The functions listed below allow plugging in any module that
records schedule entries. Such a module registers the following
methods through sge_serf_init():
typedef void (*record_schedule_entry_func_t)(
u_long32 job_id,
u_long32 ja_taskid,
const char *state,
u_long32 start_time,
u_long32 end_time,
char level_char,
const char *object_name,
const char *name,
double utilization);
typedef void (*new_schedule_func_t)(u_long32 time);
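As a sketch, a recording module could implement and register these hooks
as follows (the function names my_record_entry and my_new_schedule are
hypothetical; serf_init() is documented below):
   static void my_record_entry(u_long32 job_id, u_long32 ja_taskid,
         const char *state, u_long32 start_time, u_long32 end_time,
         char level_char, const char *object_name, const char *name,
         double utilization)
   {
      /* e.g. append one schedule entry record to a report file */
   }
   static void my_new_schedule(u_long32 time)
   {
      /* e.g. mark the begin of a new scheduling run */
   }
   serf_init(my_record_entry, my_new_schedule);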
parallel_global_slots() --
dispatch_t - 0 ok got an assignment + set time for DISPATCH_TIME_QUEUE_END
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
parallel_queue_slots() --
int - 0 ok got an assignment + set time for DISPATCH_TIME_NOW and
DISPATCH_TIME_QUEUE_END (only if fixed_slot is true)
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
sequential_global_time() --
int - 0 ok got an assignment + set time for DISPATCH_TIME_QUEUE_END
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
sequential_queue_time() --
dispatch_t - 0 ok got an assignment + set time for DISPATCH_TIME_NOW and
DISPATCH_TIME_QUEUE_END (only if fixed_slot is true)
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
SPLIT_-Constants -- Constants used for split_jobs()
enum {
SPLIT_FIRST,
SPLIT_PENDING = SPLIT_FIRST,
SPLIT_PENDING_EXCLUDED,
SPLIT_PENDING_EXCLUDED_INSTANCES,
SPLIT_SUSPENDED,
SPLIT_WAITING_DUE_TO_PREDECESSOR,
SPLIT_HOLD,
SPLIT_ERROR,
SPLIT_WAITING_DUE_TO_TIME,
SPLIT_RUNNING,
SPLIT_FINISHED,
SPLIT_LAST
};
SPLIT_PENDING - Pending jobs/tasks which may be dispatched
SPLIT_PENDING_EXCLUDED - Pending jobs/tasks which won't
be dispatched because this would exceed
'max_u_jobs'
SPLIT_PENDING_EXCLUDED_INSTANCES - Pending jobs/tasks which
won't be dispatched because this would
exceed 'max_aj_instances'
SPLIT_SUSPENDED - Suspended jobs/tasks
SPLIT_WAITING_DUE_TO_PREDECESSOR - Jobs/Tasks waiting for
others to finish
SPLIT_HOLD - Jobs/Tasks in user/operator/system hold
SPLIT_ERROR - Jobs/Tasks which are in error state
SPLIT_WAITING_DUE_TO_TIME - These jobs/tasks are not
dispatched because start time is in future
SPLIT_RUNNING - These Jobs/Tasks won't be dispatched
because they are already running
SPLIT_FINISHED - Already finished jobs/tasks
SPLIT_NOT_STARTED - jobs that could not be dispatched in one scheduling
run
SPLIT_FIRST and SPLIT_LAST might be used to build loops.
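For example, a loop over all split constants (a minimal sketch using
get_name_of_split_value(), documented below):
   int i;
   for (i = SPLIT_FIRST; i < SPLIT_LAST; i++) {
      /* e.g. print the internal name of each split list */
      printf("%s\n", get_name_of_split_value(i));
   }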
get_name_of_split_value() -- Constant to name transformation
const char* get_name_of_split_value(int value)
This function transforms a constant value into its internal
name. (Used for debug output)
int value - SPLIT_-Constant
const char* - string representation of 'value'
job_get_duration() -- Determine a job's runtime duration
bool job_get_duration(u_long32 *duration, const lListElem *jep)
The minimum of the time values the user specified with -l h_rt=<time>
and -l s_rt=<time> is returned in 'duration'. If neither of these
time values was specified, the default duration is used.
u_long32 *duration - Returns duration on success
const lListElem *jep - The job (JB_Type)
bool - true on success
MT-NOTE: job_get_duration() is MT safe
job_lists_split_with_reference_to_max_running()
void job_lists_split_with_reference_to_max_running(
lList **job_lists[],
lList **user_list,
const char* user_name,
int max_jobs_per_user)
Move those jobs which would exceed the configured
'max_u_jobs' limit (schedd configuration) from
job_lists[SPLIT_PENDING] into job_lists[SPLIT_PENDING_EXCLUDED].
Only the jobs of the given 'user_name' will be handled. If
'user_name' is NULL, then all jobs will be handled whose owner
is mentioned in 'user_list'.
lList **job_lists[] - Array of JB_Type lists
lList **user_list - User list of Type JC_Type
const char* user_name - user name
int max_jobs_per_user - "max_u_jobs"
JC_jobs of the user elements contained in "user_list" has to be
initialized properly before this function is called.
job_move_first_pending_to_running() -- Move a job
void job_move_first_pending_to_running(lListElem **pending_job,
lList **splitted_jobs[])
Move the 'pending_job' from 'splitted_jobs[SPLIT_PENDING]'
into 'splitted_jobs[SPLIT_RUNNING]'. If 'pending_job' is an
array job, then the first task (task id) will be moved into
'splitted_jobs[SPLIT_RUNNING]'
lListElem **pending_job - Pointer to a pending job (JB_Type)
lList **splitted_jobs[] - (JB_Type) array of job lists
bool - true, if the pending job was removed
split_jobs() -- Split list of jobs according to their state
void split_jobs(lList **job_list, lList **answer_list,
u_long32 max_aj_instances,
lList **result_list[])
Split a list of jobs according to their state.
'job_list' is the input list of jobs. The jobs in this list
have different job states. For the dispatch algorithm only
those jobs are of interest which are really pending. Jobs
which are pending and in error state or jobs which have a
hold applied (start time in future, administrator hold, ...)
are not necessary for the dispatch algorithm.
After a call to this function the jobs of 'job_list' may
have been moved into one of the 'result_list's.
Each of those lists contains jobs which have a certain state
(e.g. result_list[SPLIT_WAITING_DUE_TO_TIME] will contain
all jobs which have to wait according to their start time).
'max_aj_instances' is the maximum number of tasks of an
array job which may be instantiated at the same time.
'max_aj_instances' is used for the split decisions.
In case of any error the 'answer_list' will be used to report
errors (it is currently unused).
lList **job_list - JB_Type input list
lList **answer_list - answer list for error reporting (currently unused)
u_long32 max_aj_instances - max. num. of task instances
lList **result_list[] - Array of result lists (JB_Type)
In former versions of SGE/EE we had 8 split functions.
Each of those functions walked twice over the job list.
This was time consuming in case of many thousands of jobs.
We tried to improve this:
- loop over all jobs only once
- minimize copy operations where possible
Unfortunately this function is hard to understand now. Sorry!
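A minimal call sketch (the initialization pattern is an assumption derived
from the SPLIT_-Constants above):
   lList *lists[SPLIT_LAST];
   lList **splitted_job_lists[SPLIT_LAST];
   int i;
   for (i = SPLIT_FIRST; i < SPLIT_LAST; i++) {
      lists[i] = NULL;                  /* result lists start out empty */
      splitted_job_lists[i] = &lists[i];
   }
   split_jobs(&job_list, &answer_list, max_aj_instances, splitted_job_lists);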
trash_splitted_jobs() -- Trash all job lists that are not needed
void trash_splitted_jobs(lList **splitted_job_lists[])
Trash all job lists which are not needed for scheduling decisions.
Before jobs and lists are trashed, scheduling messages will
be generated.
The following lists will be trashed:
splitted_job_lists[SPLIT_ERROR]
splitted_job_lists[SPLIT_HOLD]
splitted_job_lists[SPLIT_WAITING_DUE_TO_TIME]
splitted_job_lists[SPLIT_WAITING_DUE_TO_PREDECESSOR]
splitted_job_lists[SPLIT_PENDING_EXCLUDED_INSTANCES]
splitted_job_lists[SPLIT_PENDING_EXCLUDED]
lList **splitted_job_lists[] - list of job lists
user_list_init_jc() -- inc. the # of jobs a user has running
void user_list_init_jc(lList **user_list,
const lList *running_list)
Initialize "user_list" and JC_jobs attribute for each user according
to the list of running jobs.
lList **user_list - JC_Type list
const lList *running_list - JB_Type list
void - None
schedd_mes_add() -- Add one entry into the message structure.
void schedd_mes_add(u_long32 job_number,
u_long32 message_number,
...)
During the time the scheduler tries to dispatch jobs it might
call this function to add messages into a temporary structure.
This function might be called several times. Each call
will add one element which contains one message describing
a reason why a job can't be dispatched, and the job id concerned.
When it is clear whether the job could be dispatched or not, one of
the following functions has to be called:
schedd_mes_commit()
schedd_mes_rollback()
u_long32 job_number - job id
u_long32 message_number - message number (sge_schedd_text.h)
... - arguments for format string
sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add() is MT safe
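A sketch of the intended call pattern (the message id is illustrative;
real ids come from sge_schedd_text.h):
   schedd_mes_add(job_id, SCHEDD_INFO_NOPEMATCH_);  /* assumed message id */
   ...
   if (job_was_dispatched)
      schedd_mes_commit(job_list, 0, NULL);
   else
      schedd_mes_rollback();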
schedd_mes_add_global() -- add a global message
void schedd_mes_add_global(u_long32 message_number, ...)
Add a global message into a message structure.
u_long32 message_number - message number (sge_schedd_text.h)
... - arguments for format string
sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add_global() is MT safe
schedd_mes_commit() -- Complete message elements and move them
void schedd_mes_commit(lList *job_list, int ignore_category, lRef jid_category)
Each message contained in "tmp_sme" contains only
one job id. We have to find other jobs in "job_list" and
add their job ids to the list of ids contained in the "tmp_sme"
message elements. After that we have to move all messages
contained in "tmp_sme" into "sme".
If "ignore_category" is 1, the job category will be ignored.
This means that all ids of "job_list" will be added to all
messages contained in "tmp_sme".
If no category is passed in and ignore_category is false, the messages
are only generated for the current job, i.e. they are just copied.
lList *job_list - JB_Type list
int ignore_category - if set to true, the messages will be generated for all jobs
in the list
lRef jid_category - if not NULL, the function uses the category to ensure, that
every message is only added per category once.
schedd_mes_initialize() -- Initialize module variables
void schedd_mes_initialize(void)
Initialize module variables
schedd_mes_obtain_package() -- Get message structure
lListElem *schedd_mes_obtain_package(int *global_mes_count, int *job_mes_count)
Returns the message structure which contains all messages.
int *global_mes_count - out: returns nr of global messages
int *job_mes_count - out: returns nr of job messages
The calling function is responsible for freeing the returned
message structure when it is no longer needed.
lListElem* - SME_Type element
schedd_mes_rollback() -- Free temporarily generated messages
void schedd_mes_rollback(void)
Free temporarily generated messages contained in "tmp_sme".
schedd_mes_add_join() -- same as schedd_mes_add, but joins messages based
on the message id.
void schedd_mes_add_join(u_long32 job_number, u_long32 message_number,
...)
Same as schedd_mes_add(), but joins messages based
on the message id. It only uses the temporary message
list, not the global one.
u_long32 job_number - job id
u_long32 message_number - message number (sge_schedd_text.h)
... - arguments for format string
sge_schedd_text(message_number)
MT-NOTE: schedd_mes_add_join() is MT safe
schedd_mes_get_tmp_list() -- gets all messages for the current job
lList* schedd_mes_get_tmp_list()
returns a list of all messages for the current job
lList* - message list
schedd_mes_set_tmp_list() -- sets the messages for a current job
void schedd_mes_set_tmp_list(lListElem *category, int name, u_long32 job_number)
Takes a message list, changes the job number to the current job and stores
the list.
lListElem *category - an object, which stores the list
int name - element id for the list
u_long32 job_number - job number
Simple-Scheduler-Interface -- Interface for custom schedulers
SGE provides a very simple interface for custom schedulers.
Such a scheduler can be created using the event client or the
event mirror interface.
The interface provides functions to start a job and to
delete a job.
It was created to allow an easier integration of the MAUI scheduler
into Grid Engine.
-Simple-Scheduler-Interface-Typedefs -- typedefs for the SSI
typedef struct {
int procs;
const char *host_name;
} task_map;
A task_map describes a job's structure.
A job can be spawned over an arbitrary number of hosts.
A job has an arbitrary number of tasks per host.
An array of task_map is used to pass information to ssi functions.
It can contain any number of entries, the last entry has to contain
0 as procs.
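For example, a job spawned over two hosts could be described like this
(host names are hypothetical):
   task_map tasks[] = {
      { 4, "node01" },   /* 4 tasks on host node01 */
      { 2, "node02" },   /* 2 tasks on host node02 */
      { 0, NULL }        /* terminating entry: procs == 0 */
   };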
sge_ssi_job_cancel() -- delete or restart a job
bool sge_ssi_job_cancel(const char *job_identifier, bool reschedule)
Delete the given job.
If reschedule is set to true, reschedule the job.
const char *job_identifier - job identifier in the form
<jobid>.<ja_task_id>, e.g. 123.1
bool reschedule - if true, reschedule job
bool - true, if the job could be successfully deleted (rescheduled),
else false.
The reschedule parameter is ignored in the current implementation.
sge_ssi_job_start() -- start a job
bool sge_ssi_job_start(const char *job_identifier, const char *pe,
task_map tasks[])
Start the job described by job_identifier, pe and tasks.
job_identifier has to be given in the form "<job_id>.<ja_task_id>",
e.g. "123.1" and must reference a pending job/array task.
For parallel jobs, pe has to be the name of an existing parallel
environment.
tasks describes how many tasks are to be started per host.
The function creates a scheduling order and sends it to qmaster.
const char *job_identifier - unique job identifier
const char *pe - name of a parallel environment
or NULL for sequential jobs
task_map tasks[] - mapping host->number of tasks
bool - true on success, else false
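A minimal call sketch, reusing the tasks array shown in the typedefs
section above (the PE name "mpi" is hypothetical):
   if (!sge_ssi_job_start("123.1", "mpi", tasks)) {
      /* job could not be started */
   }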
parallel_maximize_slots_pe() -- Maximize number of slots for an assignment
static int parallel_maximize_slots_pe(sge_assignment_t *best, lList *host_list,
lList *queue_list, lList *centry_list, lList *acl_list)
The largest possible slot amount is searched for a job assuming a
particular parallel environment is used at a particular start time.
If the slot number passed is 0 we start with the minimum
possible slot number for that job.
To search most efficiently for the right slot value, three search
strategies are implemented:
- binary search
- least slot value first
- highest slot value first
To be able to use binary search all possible slot values are stored in
one array. The slot values in this array are sorted in ascending order.
After the right slot value is found, it is very easy to compute the best
strategy from the result: for each strategy we compute how many iterations
would have been needed to find the correct result. These step counts are
stored for the next run and used to figure out the best algorithm.
To ensure that we can adapt to rapid changes and also ignore spikes we
use a running average in an 80-20 setting. This means
that the algorithm needs 4 (at most 5) iterations to adapt to a new
scenario.
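A minimal sketch of this exponentially weighted running average (variable
names are illustrative):
   avg_steps = 0.8 * avg_steps + 0.2 * steps_this_run;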
Further enhancements:
It might be a good idea to store the derived values with the job categories
and allow finding the best strategy per category.
sge_assignment_t *best - herein we keep all important in/out information
lList *host_list - a list of all available hosts
lList *queue_list - a list of all available queues
lList *centry_list - a list of all available complex attributes
lList *acl_list - a list of all access lists
int - 0 ok got an assignment (maybe without maximizing it)
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: parallel_maximize_slots_pe() is MT safe as long as the provided
lists are owned by the caller
SEE ALSO:
sconf_best_pe_alg
sconf_update_pe_alg
add_pe_slots_to_category
parallel_reservation_max_time_slots() -- Search earliest possible assignment
static dispatch_t parallel_reservation_max_time_slots(sge_assignment_t *best)
The earliest possible assignment is searched for a job assuming a
particular parallel environment is used with a particular slot
number. If the slot number passed is 0 we start with the minimum
possible slot number for that job. The search starts with the
latest queue end time if DISPATCH_TIME_QUEUE_END was specified
rather than a real time value.
sge_assignment_t *best - herein we keep all important in/out information
dispatch_t - 0 ok got an assignment
1 no assignment at the specified time (???)
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: parallel_reservation_max_time_slots() is not MT safe
sge_select_parallel_environment() -- Decide about a PE assignment
static dispatch_t sge_select_parallel_environment(sge_assignment_t *best, lList
*pe_list)
When users issue a wildcard PE request such as -pe <pe_range> 'mpi8_*',
more than a single parallel environment can match the wildcard expression.
In case of 'now' assignments the PE that gives us the largest assignment
is selected. When scheduling a reservation we search for the earliest
assignment for each PE and then choose the one that finally gets us the
maximum number of slots.
The scheduler info messages are not cached. They are added globally and have
to be added for each job in the category. When the messages are updated
this has to be changed.
sge_assignment_t *best - herein we keep all important in/out information
lList *pe_list - the list of all parallel environments (PE_Type)
dispatch_t - 0 ok got an assignment
1 no assignment at the specified time (???)
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: sge_select_parallel_environment() is not MT safe
build_name_filter() -- fills an array with complex names, which can be used
as a filter.
void build_name_filter(const char **filter, lList *list, int t_name, int
*pos)
Takes an array of a given size and fills in complex names.
const char **filter - target for the filter strings. It has to be of sufficient size.
lList *list - a list of complexes, from which the names are extracted
int t_name - specifies the field which is used as a name
???
get_attribute_list() -- generates a list for all defined elements in a queue, host, global
static lList* get_attribute_list(lListElem *global, lListElem *host,
lListElem *queue, lList *centry_list)
Generates a list for all attributes defined at the given queue, host, global.
lListElem *global - global host
lListElem *host - host (or NULL, if only global attributes are important)
lListElem *queue - queue (or NULL if only host/global attributes are important)
lList *centry_list - system wide attribute config list
static lList* - list of attributes or NULL, if no attributes exist.
get_attribute_list_by_names() -- generates a list of attributes from the given names
static lList* get_attribute_list_by_names(lListElem *global, lListElem
*host, lListElem *queue, lList *centry_list, lList *attrnames)
Assembles a list of attributes for a given queue, host, global, which contains all
the specified elements. The general sort order is global, host, queue. If an
element cannot be found, it will be missing from the result. If no elements
exist, the function returns NULL.
lListElem *global - global host
lListElem *host - host (or NULL, if only global resources are asked for )
lListElem *queue - queue (or NULL, if only global / host resources are asked for)
lList *centry_list - the system wide attribute config list
lList *attrnames - ST_Type list of attribute names
static lList* - a CULL list of elements or NULL
is_attr_prior2() -- checks if the value set in the structure has a higher priority
than the new one
static bool is_attr_prior2(lListElem *upper_el, double lower_value, int
t_value, int t_dominant)
Computes the priority between the value stored in a given structure and a new
value. This is based on some simple rules: whether a value is set at all
(dominant == DOMINANT_TYPE_VALUE) and which relational operator is used. If
that is not decisive, the two values are compared and, based on the operator,
true or false is returned:
if no value is set in the structure: false
if the relops are == or != : true
if the relops are >= or > : true, when the new value is smaller than the old one
if the relops are <= or < : true, when the new value is bigger than the old one
lListElem *upper_el - target structure
double lower_value - new value
int t_value - which field to use (CE_doubleval or CE_pj_doubleval)
int t_dominant - which dominant field to use (CE_dominant, CE_pj_dominant)
static bool - true, if the value in the structure has the higher priority
request_cq_rejected() -- Check, if -l request forecloses cluster queue
bool request_cq_rejected(const lList* hard_resource_list, const lListElem
*cq, const lList *centry_list, dstring *unsatisfied)
Do -l matching with the aim to foreclose the entire cluster queue.
Each cluster queue configuration profile must specify a fixed value
otherwise we can't rule out a cluster queue. Both complex_values and
queue resource limits are checked.
const lList* hard_resource_list - resource list -l (CE_Type)
const lListElem *cq - cluster queue (CQ_Type)
const lList *centry_list - complex entry list (CE_Type)
dstring *unsatisfied - diagnosis information, if rejected
bool - true, if the cluster queue is ruled out
MT-NOTE: request_cq_rejected() is MT safe
sge_dlib() -- lookup, load, and cache function from a dynamic library
void *sge_dlib(const char *key, const char *lib_name, const char *fn_name,
lib_cache_t **lib_cache_list)
const char *key - unique key for identifying function
const char *lib_name - dynamic library name
const char *fn_name - function name
lib_cache_t **lib_cache_list - cache list (if NULL, we use a global cache)
void * - the address of the function
MT-NOTE: sge_dlib() is not MT safe
sge_job_slot_request() -- return static urgency jobs slot request
int sge_job_slot_request(lListElem *job, lList *pe_list)
For sequential jobs the static urgency job slot request is always 1.
For parallel jobs the static urgency job slot request depends on
static urgency slots as defined with sge_pe(5).
lListElem *job - the job (JB_Type)
lList *pe_list - the PE list (PE_Type)
int - Number of slots
In case of a wildcard parallel environment request the setting of the
first matching is used. Behaviour is undefined if multiple parallel
environments specify different settings!
task_get_duration() -- Determine a task's effective runtime limit
bool task_get_duration(u_long32 *duration, const lListElem *ja_task)
Determines the effective runtime limit given by the requested h_rt/s_rt or
by the resulting queue's h_rt/s_rt
u_long32 *duration - the task's duration in seconds
const lListElem *ja_task - task element
bool - true
MT-NOTE: task_get_duration() is MT safe
sge_GetNumberOfOrders() -- returns the number of orders generated
int sge_GetNumberOfOrders(order_t *orders)
returns the number of orders generated
order_t *orders - a structure of orders
int - number of orders in the structure
MT-NOTE: sge_GetNumberOfOrders() is MT safe
sge_add_schedd_info() -- retrieves the messages and generates an order out
of them.
lList* sge_add_schedd_info(lList *or_list, int *global_mes_count, int
*job_mes_count)
Retrieves all messages, puts them into an order package, and frees the
original messages. It also returns the number of global and job messages.
lList *or_list - in: the order list to which the message order is added
int *global_mes_count - out: global message count
int *job_mes_count - out: job message count
lList* - the order list
MT-NOTE: sge_add_schedd_info() is not MT safe
sge_create_orders() -- Create a new order-list or add orders to an existing one
lList* sge_create_orders(lList *or_list, u_long32 type, lListElem *job,
lListElem *ja_task, lList *granted, bool update_execd)
- If the or_list is NULL, a new one will be generated
- in case of a clear_pri order, the ja_task is important. If NULL is passed
in for ja_task, only the pending tasks of the specified job are set to NULL.
If a ja_task is passed in, all tasks of the job are set to NULL
lList *or_list - the order list
u_long32 type - order type
lListElem *job - job
lListElem *ja_task - ja_task ref or NULL(there is only one case, where it can be NULL)
lList *granted - granted queue list
bool update_execd - should the execd get new ticket values?
lList* - returns the order list
MT-NOTE: sge_create_orders() is MT safe
sge_join_orders() -- generates one order list from the order structure
lList* sge_join_orders(order_t orders)
generates one order list from the order structure and cleans up
the order structure. Orders which have already been sent are
removed.
order_t orders - the order structure
lList* - an order list
MT-NOTE: sge_join_orders() is not MT safe
pe_match_static() -- Why not job to PE?
int pe_match_static(lListElem *job, lListElem *pe, lList *acl_list, bool
only_static_checks)
Checks if PE is suited for the job.
lListElem *job - ???
lListElem *pe - ???
lList *acl_list - ???
bool only_static_checks - ???
dispatch_t - DISPATCH_OK ok
DISPATCH_NEVER_CAT assignment will never be possible for all
jobs of that category
MT-NOTE: pe_match_static() is not MT safe
sge_qeti_list_add() -- Adds a resource utilization to QETI resource list
static int sge_qeti_list_add(lList **lpp, const char *name, lList*
rue_lp, double total, bool must_exist)
???
lList **lpp - QETI resource list
const char *name - Name of the resource
lList* rue_lp - Resource utilization entry (RUE_Type)
double total - Total resource amount
bool must_exist - If true the entry must exist in 'lpp'.
static int - 0 on success
MT-NOTE: sge_qeti_list_add() is not MT safe
sge_qeti_next_before() -- ???
void sge_qeti_next_before(sge_qeti_t *qeti, u_long32 start)
All queue end next references are set so that sge_qeti_next() will
return a time value that is before (i.e. less than) 'start'.
sge_qeti_t *qeti - ???
u_long32 start - ???
MT-NOTE: sge_qeti_next_before() is MT safe
check_and_debit_rqs_slots() -- Determine RQS limit slot amount and debit
static void check_and_debit_rqs_slots(sge_assignment_t *a, const char
*host, const char *queue, int *slots, int *slots_qend, dstring
*rule_name, dstring *rue_name, dstring *limit_name)
The function determines the final slots and slots_qend amounts due
to all resource quota limitations that apply to the queue instance.
Both slot amounts get debited from a->limit_list to keep track
of the still available amounts per resource quota limit.
sge_assignment_t *a - Assignment data structure
const char *host - hostname
const char *queue - queuename
int *slots - needed/available slots
int *slots_qend - needed/available slots_qend
dstring *rule_name - caller maintained buffer
dstring *rue_name - caller maintained buffer
dstring *limit_name - caller maintained buffer
MT-NOTE: check_and_debit_rqs_slots() is MT safe
cqueue_shadowed() -- Check for cluster queue rule before current rule
static bool cqueue_shadowed(const lListElem *rule, sge_assignment_t *a)
Check whether there is any cluster queue specific rule before the
current rule.
const lListElem *rule - Current rule
sge_assignment_t *a - Scheduler assignment
static bool - True if shadowed
limit queue Q001 to F001=1
limit host gridware to F001=0 (--> returns 'true' due to 'Q001' meaning
that gridware can't be generally ruled out )
MT-NOTE: cqueue_shadowed() is MT safe
cqueue_shadowed_by() -- Check rules shadowing current cluster queue rule
static bool cqueue_shadowed_by(const char *cqname, const lListElem *rule,
sge_assignment_t *a)
Check if cluster queue in current rule is shadowed.
const char *cqname - Cluster queue name to check
const lListElem *rule - Current rule
sge_assignment_t *a - Assignment
static bool - True if shadowed
limits queues Q001,Q002 to F001=1
limits queues Q002,Q003 to F001=1 (--> returns 'true' for Q002 and 'false' for Q003)
MT-NOTE: cqueue_shadowed_by() is MT safe
debit_job_from_rqs() -- debits job in all relevant resource quotas
int debit_job_from_rqs(lListElem *job, lList *granted, lListElem* pe,
lList *centry_list)
The function debits the requested amount of resources in all relevant rules.
lListElem *job - job request (JB_Type)
lList *granted - granted list (JG_Type)
lListElem* pe - granted pe (PE_Type)
lList *centry_list - consumable resources list (CE_Type)
int - always 0
MT-NOTE: debit_job_from_rqs() is not MT safe
host_shadowed() -- Check for host rule before current rule
static bool host_shadowed(const lListElem *rule, sge_assignment_t *a)
Check whether there is any host specific rule before the
current rule.
const lListElem *rule - Current rule
sge_assignment_t *a - Scheduler assignment
static bool - True if shadowed
limit host gridware to F001=1
limit queue Q001 to F001=0 (--> returns 'true' due to 'gridware' meaning
that Q001 can't be generally ruled out )
MT-NOTE: host_shadowed() is MT safe
host_shadowed_by() -- Check rules shadowing current host rule
static bool host_shadowed_by(const char *host, const lListElem *rule,
sge_assignment_t *a)
Check if host in current rule is shadowed.
const char *host - Host name to check
const lListElem *rule - Current rule
sge_assignment_t *a - Assignment
static bool - True if shadowed
limits hosts host1,host2 to F001=1
limits hosts host2,host3 to F001=1 (--> returns 'true' for host2 and 'false' for host3)
MT-NOTE: host_shadowed_by() is MT safe
is_cqueue_expand() -- Returns true if rule expands on cluster queues
bool is_cqueue_expand(const lListElem *rule)
Returns true if rule expands on cluster queues.
const lListElem *rule - RQR_Type
bool - True if rule expands on cluster queues
"queues {*}" returns true
"queues Q001,Q002" returns false
MT-NOTE: is_cqueue_expand() is MT safe
is_cqueue_global() -- Global rule with regards to cluster queues?
bool is_cqueue_global(const lListElem *rule)
const lListElem *rule - RQR_Type
bool - True if cluster queues play no role with the rule
MT-NOTE: is_cqueue_global() is MT safe
is_host_expand() -- Returns true if rule expands on hosts
bool is_host_expand(const lListElem *rule)
Returns true if rule expands on hosts.
const lListElem *rule - RQR_Type
bool - True if rule expands on hosts
"hosts {*}" returns true
"hosts @allhosts" returns false
MT-NOTE: is_host_expand() is MT safe
is_host_global() -- Global rule with regards to hosts?
bool is_host_global(const lListElem *rule)
Return true if hosts play no role with the rule
const lListElem *rule - RQR_Type
bool - True if hosts play no role with the rule
MT-NOTE: is_host_global() is MT safe
parallel_limit_slots_by_time() -- Determine number of slots avail. within
time frame
static dispatch_t parallel_limit_slots_by_time(const sge_assignment_t *a,
lList *requests, int *slots, int *slots_qend, lListElem *centry, lListElem
*limit, dstring rue_name)
???
const sge_assignment_t *a - job info structure (in)
lList *requests - Job request list (CE_Type)
int *slots - out: free slots
int *slots_qend - out: free slots in the far far future
lListElem *centry - Load information for the resource
lListElem *limit - limitation (RQRL_Type)
dstring rue_name - rue_name saved in limit sublist RQRL_usage
lListElem *qep - queue instance (QU_Type)
static dispatch_t - DISPATCH_OK got an assignment
- DISPATCH_NEVER_CAT no assignment for all jobs of that category
MT-NOTE: parallel_limit_slots_by_time() is not MT safe
parallel_rqs_slots_by_time() -- Determine number of slots avail within
time frame
dispatch_t parallel_rqs_slots_by_time(const sge_assignment_t *a,
int *slots, int *slots_qend, const char *host, const char *queue)
This function iterates for a queue instance over all resource quota sets
and evaluates the number of slots available.
const sge_assignment_t *a - job info structure (in)
int *slots - out: # free slots
int *slots_qend - out: # free slots in the far far future
lListElem *qep - QU_Type Elem
static dispatch_t - DISPATCH_OK got an assignment
- DISPATCH_NEVER_CAT no assignment for all jobs of that category
MT-NOTE: parallel_rqs_slots_by_time() is not MT safe
rqs_by_slots() -- Check queue instance suitability due to RQS
dispatch_t rqs_by_slots(sge_assignment_t *a, const char *queue,
const char *host, u_long32 *tt_rqs_all, bool *is_global,
dstring *rue_string, dstring *limit_name, dstring *rule_name)
Checks (or determines earliest time) queue instance suitability
according to resource quota set limits.
For performance reasons RQS verification results are cached in
a->limit_list. In addition unsuited queues and hosts are collected
in a->skip_cqueue_list and a->skip_host_list so that ruling out
chunks of queue instances becomes quite cheap.
sge_assignment_t *a - assignment
const char *queue - cluster queue name
const char *host - host name
u_long32 *tt_rqs_all - returns earliest time over all resource quotas
bool *is_global - returns true if result is valid for any other queue
dstring *rue_string - caller maintained buffer
dstring *limit_name - caller maintained buffer
dstring *rule_name - caller maintained buffer
u_long32 tt_best - time of best solution found so far
static dispatch_t - usual return values
MT-NOTE: rqs_by_slots() is MT safe
rqs_can_optimize() -- Poke whether a queue/host negation can be made
static void rqs_can_optimize(const lListElem *rule, bool *host, bool
*queue, sge_assignment_t *a)
A global limit was hit with 'rule'. This function helps to determine
to what extent we can profit from that situation. If there is no
previous matching rule within the same rule set any other queue/host
can be skipped.
const lListElem *rule - Rule
bool *host - Any previous rule with a host scope?
bool *queue - Any previous rule with a queue scope?
sge_assignment_t *a - Scheduler assignment
MT-NOTE: rqs_can_optimize() is MT safe
rqs_exceeded_sort_out() -- Rule out queues/hosts whenever possible
bool rqs_exceeded_sort_out(sge_assignment_t *a, const lListElem *rule,
const dstring *rule_name, const char* queue_name, const char* host_name)
This function tries to rule out hosts and cluster queues after a
quota was exceeded for a limitation rule with a specific queue
instance.
When a limitation was exceeded that applies to the entire
cluster 'true' is returned, 'false' otherwise.
sge_assignment_t *a - Scheduler assignment type
const lListElem *rule - The exceeded rule
const dstring *rule_name - Name of the rule (monitoring only)
const char* queue_name - Cluster queue name
const char* host_name - Host name
bool - True upon global limits exceeding
MT-NOTE: rqs_exceeded_sort_out() is MT safe
rqs_exceeded_sort_out_par() -- Rule out queues/hosts whenever possible
void rqs_exceeded_sort_out_par(sge_assignment_t *a, const lListElem
*rule, const dstring *rule_name, const char* queue_name, const char*
host_name)
Function wrapper around rqs_exceeded_sort_out() for parallel jobs.
In contrast to the sequential case global limit exceeding is handled
by adding all cluster queue names to the a->skip_cqueue_list.
sge_assignment_t *a - Scheduler assignment type
const lListElem *rule - The exceeded rule
const dstring *rule_name - Name of the rule (monitoring only)
const char* queue_name - Cluster queue name
const char* host_name - Host name
MT-NOTE: rqs_exceeded_sort_out_par() is MT safe
rqs_excluded_cqueues() -- Find excluded queues
static void rqs_excluded_cqueues(const lListElem *rule, sge_assignment_t *a)
Find queues that are excluded by previous rules.
const lListElem *rule - The rule
sge_assignment_t *a - Scheduler assignment
limit projects {*} queues !Q001 to F001=1
limit to F001=0 ( ---> returns Q001 in a->skip_cqueue_list)
MT-NOTE: rqs_excluded_cqueues() is MT safe
rqs_excluded_hosts() -- Find excluded hosts
static void rqs_excluded_hosts(const lListElem *rule, sge_assignment_t *a)
Find hosts that are excluded by previous rules.
const lListElem *rule - The rule
sge_assignment_t *a - Scheduler assignment
limit projects {*} queues !gridware to F001=1
limit to F001=0 ( ---> returns gridware in skip_host_list)
MT-NOTE: rqs_excluded_hosts() is MT safe
rqs_expand_cqueues() -- Add all matching cqueues to the list
void rqs_expand_cqueues(const lListElem *rule)
The names of all cluster queues that match the rule are added to
the skip list without duplicates.
const lListElem *rule - RQR_Type
MT-NOTE: rqs_expand_cqueues() is not MT safe
rqs_expand_hosts() -- Add all matching hosts to the list
void rqs_expand_hosts(const lListElem *rule, lList **skip_host_list,
const lList *host_list, lList *hgrp_list)
The names of all hosts that match the rule are added to
the skip list without duplicates.
const lListElem *rule - RQR_Type
const lList *host_list - EH_Type
MT-NOTE: rqs_expand_hosts() is MT safe
rqs_limitation_reached() -- is the limitation reached for a queue instance
static bool rqs_limitation_reached(sge_assignment_t *a, lListElem *rule,
const char* host, const char* queue)
The function verifies that no limitation is reached for the specific job
request and queue instance.
sge_assignment_t *a - job info structure
const lListElem *rule - resource quota rule (RQR_Type)
const char* host - host name
const char* queue - queue name
u_long32 *start - start time of job
static dispatch_t - DISPATCH_OK job can be scheduled
DISPATCH_NEVER_CAT no jobs of this category will be scheduled
DISPATCH_NOT_AT_TIME job can be scheduled later
DISPATCH_MISSING_ATTR rule does not match requested attributes
MT-NOTE: rqs_limitation_reached() is not MT safe
rqs_match_assignment() -- Check whether resource quota rule matches any queue instance
static bool rqs_match_assignment(const lListElem *rule, sge_assignment_t
*a)
Check whether a resource quota rule can match any queue instance. If
it does not match due to users/projects/pes scope, one can rule this
out.
Note: As long as rqs_match_assignment() is not used for parallel jobs
passing NULL as PE request is perfectly fine.
const lListElem *rule - Resource quota rule
sge_assignment_t *a - Scheduler assignment
static bool - True if it matches
MT-NOTE: rqs_match_assignment() is MT safe
rqs_set_dynamical_limit() -- evaluate dynamical limit
bool rqs_set_dynamical_limit(lListElem *limit, lListElem
*global_host, lListElem *exec_host, lList *centry)
The function evaluates, if necessary, the dynamic limit for a host and
sets the evaluated double value in the given limitation element (RQRL_dvalue).
An evaluation is necessary if the limit boolean RQRL_dynamic is true. This
field is set by qmaster during the rule set verification.
lListElem *limit - limitation (RQRL_Type)
lListElem *global_host - global host (EH_Type)
lListElem *exec_host - exec host (EH_Type)
lList *centry - consumable resource list (CE_Type)
bool - always true
MT-NOTE: rqs_set_dynamical_limit() is MT safe
sge_user_is_referenced_in_rqs() -- search for user reference in rqs
bool sge_user_is_referenced_in_rqs(const lList *rqs, const char *user,
const char *group, lList *acl_list)
Search for a user reference in the resource quota sets
const lList *rqs - resource quota set list
const char *user - user to search
const char *group - user's group
lList *acl_list - acl list for user resolving
bool - true if user was found
false if user was not found
MT-NOTE: sge_user_is_referenced_in_rqs() is MT safe
add_calendar_to_schedule() -- adds the queue calendars to the resource
schedule
static void add_calendar_to_schedule(lList *queue_list, u_long32 now)
Adds the queue calendars to the resource schedule. It uses
the slot entry to simulate an enabled / disabled calendar.
lList *queue_list - all queues which can possibly run jobs
u_long32 now - now time of assignment
MT-NOTE: add_calendar_to_schedule() is MT safe
add_job_utilization() -- Debit assignments' utilization from all schedules
int add_job_utilization(const sge_assignment_t *a, const char *type, bool for_job_scheduling)
The resource utilization of an assignment is debited into the schedules
of the global, host and queue instance resource containers and the
limitation rule sets. For parallel jobs debiting is also done in the
parallel environment schedule.
const sge_assignment_t *a - The assignment
const char *type - A string that is used to monitor assignment
type
bool for_job_scheduling - utilize for job or for advance reservation
int -
MT-NOTE: add_job_utilization() is MT safe
newResourceElem() -- creates new resource schedule entry
static lListElem* newResourceElem(u_long32 time, double amount)
creates new resource schedule entry and returns it
u_long32 time - specific time
double amount - the utilized amount
static lListElem* - new resource schedule entry
MT-NOTE: newResourceElem() is MT safe
prepare_resource_schedules() -- Debit non-pending jobs in resource schedule
static void prepare_resource_schedules(const lList *running_jobs, const
lList *suspended_jobs, lList *pe_list, lList *host_list, lList
*queue_list, lList *centry_list, lList *rqs_list)
In order to reflect current and future resource utilization of running
and suspended jobs in the schedule we iterate through all jobs and debit
resources requested by those jobs.
const lList *running_jobs - The running ones (JB_Type)
const lList *suspended_jobs - The suspended ones (JB_Type)
lList *pe_list - ???
lList *host_list - ???
lList *queue_list - ???
lList *rqs_list - configured resource quota sets
lList *centry_list - ???
lList *acl_list - ???
lList *hgroup_list - ???
bool for_job_scheduling - prepare the schedule for job or for advance
reservation scheduling
u_long32 now - now time of assignment
MT-NOTE: prepare_resource_schedules() is not MT safe
rqs_add_job_utilization() -- Debit assignment's utilization in a limitation
rule
static int rqs_add_job_utilization(lListElem *jep, u_long32 task_id,
const char *type, lListElem *rule, dstring rue_name, lList *centry_list,
int slots, const char *obj_name, u_long32 start_time, u_long32 end_time,
bool is_master_task)
???
lListElem *jep - job element (JB_Type)
u_long32 task_id - task id to debit
const char *type - String denoting type of utilization entry
lListElem *rule - limitation rule (RQR_Type)
dstring rue_name - rue_name where to debit
lList *centry_list - master centry list (CE_Type)
int slots - slots to debit
const char *obj_name - name of the object where to debit
u_long32 start_time - start time of utilization
u_long32 end_time - end time of utilization
bool is_master_task - is this the master task being debited
static int - number of modified limits
MT-NOTE: rqs_add_job_utilization() is MT safe
serf_exit() -- Closes SERF
void serf_exit(void)
All operations required to cleanly shut down the SERF are done.
MT-NOTE: serf_exit() is MT safe
serf_init() -- Initializes SERF
void serf_init(record_schedule_entry_func_t write, new_schedule_func_t
newline)
MT-NOTE: serf_init() is not MT safe
serf_new_interval() -- Indicate a new scheduling run
void serf_new_interval(u_long32 time)
When a new scheduling run is started serf_new_interval() shall be
called to indicate this. This allows assigning of schedule entry
records to different schedule runs.
u_long32 time - The time when the schedule run was started.
MT-NOTE: (1) serf_new_interval() is MT safe if no recording function
MT-NOTE: was registered via serf_init().
MT-NOTE: (2) Otherwise MT safety of serf_new_interval() depends on
MT-NOTE: MT safety of registered recording function
serf_record_entry() -- Add a new schedule entry record
void serf_record_entry(u_long32 job_id, u_long32 ja_taskid, const char
*state, u_long32 start_time, u_long32 end_time, char level_char, const
char *object_name, const char *name, double utilization)
The entirety of all information passed to this function describes
the schedule that was created during a scheduling interval of a
Grid Engine scheduler. To reflect multiple resource debitations
of a job, multiple calls to serf_record_entry() are required. For
parallel jobs serf_record_entry() is called once with
'P' as level_char.
u_long32 job_id - The job id
u_long32 ja_taskid - The task id
const char *state - A string indicating the reason why the
utilization was put into the schedule:
RUNNING - Job was running before scheduling run
SUSPENDED - Job was suspended before scheduling run
MIGRATING - Job being preempted (unused)
STARTING - Job will be started
RESERVING - Job reserves resources
u_long32 start_time - Start of the resource utilization
u_long32 end_time - End of the resource utilization
char level_char - Q - Queue
H - Host
G - Global
P - Parallel Environment (PE)
const char *object_name - Name of Queue/Host/Global/PE
const char *name - Resource name
double utilization - Utilization amount
MT-NOTE: (1) serf_record_entry() is MT safe if no recording function
MT-NOTE: was registered via serf_init().
MT-NOTE: (2) Otherwise MT safety of serf_record_entry() depends on
MT-NOTE: MT safety of registered recording function
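A sketch of a recording sequence for one scheduling run (job id, times and
object/resource names are illustrative):
   serf_new_interval(now);               /* begin of a new schedule */
   serf_record_entry(123, 1, "STARTING", now, now + 3600,
                     'Q', "all.q", "slots", 4.0);
   serf_record_entry(123, 1, "STARTING", now, now + 3600,
                     'H', "node01", "virtual_free", 2.0);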
set_utilization() -- adds one specific calendar entry to the resource schedule
static void set_utilization(lList *uti_list, u_long32 from, u_long32
till, double uti)
This set utilization function is unique to calendars. It removes all other
utilization settings in the given time interval and replaces them with the
given one.
lList *uti_list - the utilization list for a specific resource and queue
u_long32 from - start time for this utilization
u_long32 till - end time for this utilization
double uti - utilization (needs to be bigger than 1; should be the maximum)
MT-NOTE: set_utilization() is MT safe
sge_qeti_first() --
u_long32 sge_qeti_first(sge_qeti_t *qeti)
Initialize/Reinitialize the Queue End Time Iterator. All queue end next
references are initialized to the queue end of all resource instances.
Before we return the time that is most in the future, queue end next
references are switched to the next entry that is earlier than the time
that was returned.
sge_qeti_t *qeti - ???
u_long32 -
MT-NOTE: sge_qeti_first() is MT safe
sge_qeti_next() -- ???
u_long32 sge_qeti_next(sge_qeti_t *qeti)
Return the next time that is most in the future. Queue end next
references are then switched to the next entry that is earlier than the
time that was returned.
sge_qeti_t *qeti - ???
u_long32 -
MT-NOTE: sge_qeti_next() is MT safe
sge_qeti_release() -- Release queue end time iterator
void sge_qeti_release(sge_qeti_t *qeti)
Release all resources of the queue end time iterator. Referenced
resource utilization diagrams are not affected.
sge_qeti_t *qeti - ???
MT-NOTE: sge_qeti_release() is MT safe
utilization_add() -- Debit a job's resource utilization
int utilization_add(lListElem *cr, u_long32 start_time, u_long32
duration, double utilization, u_long32 job_id, u_long32 ja_taskid,
u_long32 level, const char *object_name, const char *type)
A job's resource utilization is debited into the resource
utilization diagram at the given time for the given duration.
lListElem *cr - Resource utilization entry (RUE_Type)
u_long32 start_time - Start time of utilization
u_long32 duration - Duration
double utilization - Amount
u_long32 job_id - Job id
u_long32 ja_taskid - Task id
u_long32 level - *_TAG
const char *object_name - The objects name
const char *type - String denoting type of utilization entry.
bool is_job - reserve for job or for advance reservation
bool implicit_non_exclusive - add implicit entry for non-exclusive jobs
requesting a exclusive centry
int - 0 on success
MT-NOTE: utilization_add() is not MT safe
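A call sketch (all values are illustrative; QUEUE_TAG stands for one of
the *_TAG level constants):
   /* debit 4.0 units of a resource for one hour starting at 'now' */
   utilization_add(cr, now, 3600, 4.0, job_id, ja_taskid,
                   QUEUE_TAG, "all.q", "STARTING");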
utilization_below() -- Determine earliest time util is below max_util
u_long32 utilization_below(const lListElem *cr, double max_util, const
char *object_name)
Determine and return earliest time utilization is below max_util.
const lListElem *cr - Resource utilization entry (RUE_utilized)
double max_util - The maximum utilization we're asking for
const char *object_name - Name of the queue/host/global for monitoring
purposes.
bool for_excl_request - match for exclusive request
u_long32 - The earliest time or DISPATCH_TIME_NOW.
MT-NOTE: utilization_below() is MT safe
utilization_max() -- Determine max utilization within timeframe
double utilization_max(const lListElem *cr, u_long32 start_time, u_long32
duration)
Determines the maximum utilization within the given timeframe.
const lListElem *cr - Resource utilization entry (RUE_utilized)
u_long32 start_time - Start time of the timeframe
u_long32 duration - Duration of timeframe
bool for_excl_request - For exclusive request
double - Maximum utilization
MT-NOTE: utilization_max() is MT safe
utilization_print_to_dstring() -- Print resource utilization to dstring
bool utilization_print_to_dstring(const lListElem *this_elem, dstring
*string)
Print resource utilization as a plain number to the dstring.
const lListElem *this_elem - A RUE_Type element
dstring *string - The string
bool - error state
true - success
false - error
MT-NOTE: utilization_print_to_dstring() is MT safe
utilization_queue_end() -- Determine utilization at queue end time
double utilization_queue_end(const lListElem *cr)
Determine utilization at queue end time. Jobs that last forever
can cause a non-zero utilization.
const lListElem *cr - Resource utilization entry (RUE_utilized)
bool for_excl_request - For exclusive request
double - queue end utilization
MT-NOTE: utilization_queue_end() is MT safe
sge_get_schedd_text() -- transforms an id into an info message
const char* sge_get_schedd_text(int nr)
transforms an id into an info message
int nr - info id
const char* - info message
MT-NOTE: sge_get_schedd_text() is MT safe
access_cq_rejected() -- Check, if cluster queue rejects user/project
static bool access_cq_rejected(const char *user, const char *group, const
lList *acl_list, const lListElem *cq)
???
const char *user - Username
const char *group - Groupname
const lList *acl_list - List of access list definitions
const lListElem *cq - Cluster queue
static bool - True, if rejected
MT-NOTE: access_cq_rejected() is MT safe
add_pe_slots_to_category() -- defines an array of valid slot values
static bool add_pe_slots_to_category(category_use_t *use_category,
u_long32 *max_slotsp, lListElem *pe, int min_slots, int max_slots, lList
*pe_range)
In case of PE ranges this function allocates memory and fills it with
valid PE slot values. If a category is set, it stores them in the category
for further jobs.
category_use_t *use_category - category caching structure, must not be NULL
u_long32 *max_slotsp - number of different slot settings
lListElem *pe - pe, must not be NULL
int min_slots - min slot setting (pe range)
int max_slots - max slot setting (pe range)
lList *pe_range - pe range, must not be NULL
static bool - true, if successful
MT-NOTE: add_pe_slots_to_category() is MT safe
clean_up_parallel_job() -- removes tags
static void clean_up_parallel_job(sge_assignment_t *a)
During PE job dispatch many queues and hosts are tagged. This
function removes the tags.
sge_assignment_t *a - the resource structure
MT-NOTE: clean_up_parallel_job() is not MT safe
clear_resource_tags() -- removes the tags from a resource request.
static void clear_resource_tags(lList *resources, u_long32 max_tag)
Removes the tags from the given resource list. A tag is only removed
if it is smaller than or equal to the given tag value. The tag value "MAX_TAG"
results in removing all existing tags, while the value "HOST_TAG" removes
queue and host tags but keeps the global tags.
lList *resources - list of job requests.
u_long32 max_tag - max tag element
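For example (a sketch):
   clear_resource_tags(job_requests, HOST_TAG);  /* drops queue and host
                                                    tags, keeps global tags */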
compute_soft_violations() -- counts the violations in the request for a given host or queue
static int compute_soft_violations(lListElem *queue, int violation, lListElem *job,lList *load_attr, lList *config_attr,
lList *actual_attr, lList *centry_list, u_long32 layer, double lc_factor, u_long32 tag)
This function checks if the current resources can satisfy the requests. The resources come from the global host, a
given host, or the queue. The function returns the number of violations.
const sge_assignment_t *a - job info structure
lListElem *queue - should only be set when using this method on queue level
int violation - the number of previous violations. This is needed to get a correct result on queue level.
lList *load_attr - the load attributes, only when used on hosts or global
lList *config_attr - a list of custom attributes (CE_Type)
lList *actual_attr - a list of custom consumables, they contain the current usage of these attributes (RUE_Type)
u_long32 layer - the current layer flag
double lc_factor - should be set, when load correction has to be done.
u_long32 tag - the current layer tag (GLOBAL_TAG, HOST_TAG, QUEUE_TAG)
static int - the number of violations ( = (prev. violations) + (new violations in this run)).
cqueue_match_static() -- Does cluster queue match the job?
static dispatch_t cqueue_match_static(const char *cqname,
sge_assignment_t *a)
The function tries to find reasons (-q, -l and -P) why the
entire cluster queue is not suited for the job.
const char *cqname - Cluster queue name
sge_assignment_t *a - ???
static dispatch_t - Returns DISPATCH_OK or DISPATCH_NEVER_CAT
MT-NOTE: cqueue_match_static() is MT safe
fill_category_use_t() -- fills the category_use_t structure.
void fill_category_use_t(sge_assignment_t *a, category_use_t
*use_category, const char *pe_name)
If a cache structure for the given PE does not exist, it
will generate the necessary data structures.
sge_assignment_t *a - job info structure (in)
category_use_t *use_category - category info structure (out)
const char* pe_name - the current pe name or "NONE"
MT-NOTE: fill_category_use_t() is MT safe
get_attribute() -- looks for an attribute, but only for one level (host, global, or queue)
static lListElem* get_attribute(const char *attrname, lList *config_attr,
lList *actual_attr, lList *load_attr, lList *centry_list, lListElem
*queue, lListElem *rep, u_long32 layer, double lc_factor, dstring *reason)
Extracts the attribute specified by 'attrname' and finds the
more important one if it is defined multiple times on the same
level. It only cares about one level.
If the attribute is a consumable, one can specify a point in time and a duration.
This will get the caller the min amount of that resource during the time frame.
const char *attrname - attribute name one is looking for
lList *config_attr - user defined attributes (CE_Type)
lList *actual_attr - current usage of consumables (RUE_Type)
lList *load_attr - load attributes
lList *centry_list - the system wide attribute configuration
lListElem *queue - the current queue, or NULL if one works on hosts
u_long32 layer - the current layer
double lc_factor - the load correction value
dstring *reason - space for error messages or NULL
bool zero_utilization - ???
u_long32 start_time - begin of the time interval, one asks for the resource
u_long32 duration - the duration of the interval
static lListElem* - the element one was looking for or NULL
get_attribute_by_Name() -- returns an attribute by name
lListElem* get_attribute_by_Name(lListElem* global, lListElem *host,
lListElem *queue, const char* attrname, lList *centry_list,
char *reason, int reason_size)
It looks into the different configurations on host, global and queue level
and returns the attribute asked for. If the attribute is defined multiple
times, only the valid one is returned.
lListElem* global - the global host
lListElem *host - a given host; can be NULL, then only the global host is important
lListElem *queue - a queue on the given host; can be NULL, then only the host and global host are important
const char* attrname - the attribute name one is looking for
lList *centry_list - the system wide attribute config list
char *reason - memory for the error message
int reason_size - the max length of an error message
void lListElem* - the element one is looking for (a copy) or NULL.
get_queue_resource() -- extracts attribute information from the queue
static lListElem* get_queue_resource(lListElem *queue, lList *centry_list, const char *attrname)
All fixed queue attributes are coded directly into the queue structure. These have to be
extracted and formed into a CE structure; that is what this function does. It takes an
attribute name and returns a full CE structure if the attribute is set in the queue.
Otherwise it returns NULL.
lListElem *queue_elem -
lListElem *queue -
const char *attrname - name of the attribute
bool -
host_time_by_slots() -- Return time when host slots are available
int host_time_by_slots(int slots, u_long32 *start, u_long32 duration,
int *host_soft_violations, lListElem *job, lListElem *ja_task, lListElem
*hep, lList *centry_list, lList *acl_list)
The time when the specified slot amount is available at the host
is determined. The behaviour depends on the input/output parameter 'start':
DISPATCH_TIME_NOW
0 an assignment is possible now
1 no assignment now but later
-1 assignment never possible for all jobs of the same category
-2 assignment never possible for that particular job
<any other time>
0 an assignment is possible at the specified time
1 no assignment at specified time but later
-1 assignment never possible for all jobs of the same category
-2 assignment never possible for that particular job
DISPATCH_TIME_QUEUE_END
0 an assignment is possible and the start time is returned
-1 assignment never possible for all jobs of the same category
-2 assignment never possible for that particular job
int slots - ???
u_long32 *start - ???
u_long32 duration - ???
int *host_soft_violations - ???
lListElem *job - ???
lListElem *ja_task - ???
lListElem *hep - ???
lList *centry_list - ???
lList *acl_list - ???
interactive_cq_rejected() -- Check, if -now yes rejects cluster queue
static bool interactive_cq_rejected(const lListElem *cq)
Returns true if -now yes jobs cannot be run in the cluster queue
const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: interactive_cq_rejected() is MT safe
is_attr_prior() -- compares two attribute instances with each other
static bool is_attr_prior(lListElem *upper_el, lListElem *lower_el)
Checks if the first given attribute instance has a higher priority than
the second instance:
if the first is NULL, it returns false
if the second, or both, are NULL, it returns true
if the "==" or "!=" operators are used, it is true
if both are the same, it may return false
otherwise it computes the minimum or maximum of the values
lListElem *upper_el - attribute which should be overridden by the second one
lListElem *lower_el - attribute which wants to override the first one
static bool - true when the first attribute has a higher priority
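The decision rule above can be pictured as a small sketch (attr_t, CMP_* and
is_minimum_relop() are made-up stand-ins; the real function works on CE_Type
elements and their relational operators):

    /* Sketch of the precedence rule described above; all names here are
       hypothetical stand-ins for the CE_Type machinery. */
    static bool is_attr_prior_sketch(const attr_t *upper, const attr_t *lower)
    {
       if (upper == NULL)
          return false;                     /* nothing that could override */
       if (lower == NULL)
          return true;                      /* nothing to be overridden */
       if (upper->relop == CMP_EQ || upper->relop == CMP_NE)
          return true;                      /* == and != always take priority */
       if (is_minimum_relop(upper->relop))  /* <=-like operators: smaller wins */
          return upper->value <= lower->value;
       return upper->value >= lower->value; /* >=-like operators: larger wins */
    }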
is_requested() -- Returns true if specified resource is requested.
bool is_requested(lList *req, const char *attr)
Returns true if specified resource is requested. Both long name
and shortcut name are checked.
lList *req - The request list (CE_Type)
const char *attr - The resource name.
bool - true if requested, otherwise false
MT-NOTE: is_requested() is MT safe
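A minimal sketch of this check, assuming CULL's lGetElemStr() and the
CE_name/CE_shortcut fields of CE_Type:

    /* Sketch: a resource counts as requested if either its long name or
       its shortcut appears in the request list (CE_Type). */
    bool is_requested_sketch(lList *req, const char *attr)
    {
       return lGetElemStr(req, CE_name, attr) != NULL ||
              lGetElemStr(req, CE_shortcut, attr) != NULL;
    }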
load_locate_elem() -- locates a consumable category in the given load list
static lListElem* load_locate_elem(lList *load_list, lListElem
*global_consumable, lListElem *host_consumable, lListElem
*queue_consumable)
lList *load_list - the load list to work on
lListElem *global_consumable - a ref to the global consumable
lListElem *host_consumable - a ref to the host consumable
lListElem *queue_consumable - a ref to the queue consumable
static lListElem* - NULL, or the category element from the load list
MT-NOTE: load_locate_elem() is MT safe
load_np_value_adjustment() -- adjusts np load values for the number of processors
static int load_np_value_adjustment(const char* name, lListElem *hep,
double *load_correction)
Tests the load value name for the "np_*" pattern. If the pattern is found, it
retrieves the number of processors and adjusts the load_correction accordingly.
If the pattern is not found, it does nothing and returns 0 for the number of processors.
const char* name - load value name
lListElem *hep - host object
double *load_correction - current load_correction for further corrections
static int - number of processors, or 0 if it was called on a non-np load value
MT-NOTE: load_np_value_adjustment() is MT safe
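The adjustment can be sketched as follows (a simplification: the processor
count is passed in here, while the real function retrieves it from the host
object):

    #include <string.h>

    /* Sketch: "np_*" load values are per-processor averages, so a load
       correction must be scaled down by the processor count. */
    static int np_adjust_sketch(const char *name, int processors,
                                double *load_correction)
    {
       if (strncmp(name, "np_", 3) == 0) {
          if (processors > 1)
             *load_correction /= processors;  /* spread over all CPUs */
          return processors;
       }
       return 0;                               /* not an np_* load value */
    }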
match_static_advance_reservation() -- Do matching that does not depend on
queue or host
static dispatch_t match_static_advance_reservation(const sge_assignment_t
*a)
Checks whether a job that requests an advance reservation can be scheduled.
The job can be scheduled if the advance reservation is in state "running".
const sge_assignment_t *a - assignment to match
static dispatch_t - DISPATCH_OK on success
DISPATCH_NEVER_CAT on error
MT-NOTE: match_static_advance_reservation() is MT safe
parallel_assignment() -- Can we assign with a fixed PE/slot/time
int parallel_assignment(sge_assignment_t *assignment)
Returns, if possible, an assignment for a particular PE with a
fixed slot amount at a fixed time.
sge_assignment_t *a -
category_use_t *use_category - has information on how to use the job category
dispatch_t - 0 ok got an assignment
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: parallel_assignment() is not MT safe
parallel_available_slots() -- Check if number of PE slots is available
dispatch_t - 0 ok got an assignment
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
MT-NOTE: parallel_available_slots() is not MT safe
parallel_host_slots() -- Return host slots available at time period
The maximum amount available at the host for the specified time period
is determined.
parallel_tag_hosts_queues() -- Determine host slots and tag queue(s) accordingly
For a particular job the maximum number of slots that could be served
at that host is determined in accordance with the allocation rule and
returned. The time of the assignment can be either DISPATCH_TIME_NOW
or a specific time, but never DISPATCH_TIME_QUEUE_END.
In those cases when the allocation rule allows more than one slot to be
served per host, it is necessary to also consider load thresholds that may
be specified per queue. This is because load is a global/per-host
concept while load thresholds are a queue attribute.
In those cases when the allocation rule gives us neither a fixed amount
of required slots nor an upper limit for the number of slots per host (i.e.
$fill_up and $round_robin), we must iterate through all slot numbers from
1 to the maximum number of slots "total_slots" and check for each slot
amount whether we can get it or not (see the sketch after this entry).
Iteration stops when we can't get more slots from the host based on the
queue limitations and load thresholds.
As long as only a single queue at the host is eligible for the job, it
is sufficient to check with each iteration whether the corresponding
number of slots can be served by the host and its queue or not. The
really tricky case, however, is when multiple queues are eligible for a host:
here we have to determine in each iteration step also the maximum number
of slots each queue could get us, by doing a per-queue iteration from
1 up to the maximum number of slots we're testing. The optimization in
effect here is to always check only whether we could get more slots than
with the previous per-host slot amount iteration.
sge_assignment_t *a -
lListElem *hep - current host
lListElem *global - global host
int *slots - out: # free slots
int *slots_qend - out: # free slots in the far far future
int global_soft_violations - # of global soft violations
bool *master_host - out: if true, found a master host
category_use_t *use_category - in/out: how to use the job category
static dispatch_t - 0 ok got an assignment
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: parallel_tag_hosts_queues() is not MT safe
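For the $fill_up/$round_robin case the slot probing described above reduces
to a loop like the following (can_serve() is a hypothetical stand-in for the
per-host/per-queue checks including load thresholds):

    /* Sketch: probe increasing slot amounts; the last amount the host and
       its queues can serve is the per-host maximum. */
    int max_host_slots = 0;
    int n;
    for (n = 1; n <= total_slots; n++) {
       if (!can_serve(host, n))      /* queue limits + load thresholds */
          break;
       max_host_slots = n;
    }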
parallel_tag_queues_suitable4job() -- Tag queues/hosts for
a comprehensive/parallel assignment
static int parallel_tag_queues_suitable4job(sge_assignment_t
*assignment)
We tag the number of available slots for that job at global, host and
queue level under consideration of all constraints of the job. We also
mark those queues that are suitable as a master queue as possible master
queues and count the number of violations of the job's soft requests.
The method below is called comprehensive since it does the tagging
for the whole parallel job under consideration of all available
resources that could help to satisfy the job's request. This is necessary
to avoid accounting for consumable resource limitations at host/global
level multiple times.
While tagging we also set each queue's QU_host_seq_no based on the sort
order of each host. The assumption is that the host list passed is sorted
according to the load formula.
sge_assignment_t *assignment - ???
category_use_t use_category - information on how to use the job category
static dispatch_t - 0 ok got an assignment
1 no assignment at the specified time
-2 assignment will never be possible for that particular job
MT-NOTE: parallel_tag_queues_suitable4job() is not MT safe
pe_cq_rejected() -- Check, if -pe pe_name rejects cluster queue
static bool pe_cq_rejected(const char *pe_name, const lListElem *cq)
Match a job's -pe 'pe_name' request with the pe_list cluster queue configuration.
True is returned if the parallel environment has no access.
const char *pe_name - the pe request of a job (no wildcard)
const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: pe_cq_rejected() is MT safe
project_cq_rejected() -- Check, if -P project rejects cluster queue
static bool project_cq_rejected(const char *project, const lListElem *cq)
Match a job's -P 'project' with the project/xproject cluster queue configuration.
True is returned if the project has no access.
const char *project - the project of a job or NULL
const lListElem *cq - cluster queue (CQ_Type)
static bool - True, if rejected
MT-NOTE: project_cq_rejected() is MT safe
rc_time_by_slots() -- checks whether all resource requests on one level
are fulfilled
static int rc_time_by_slots(lList *requested, lList *load_attr, lList
*config_attr, lList *actual_attr, lList *centry_list, lListElem *queue,
bool allow_non_requestable, char *reason, int reason_size, int slots,
u_long32 layer, double lc_factor, u_long32 tag)
Checks whether all requests, default requests and implicit requests on this
level are fulfilled.
With reservation scheduling, the earliest start time due to resources of the
resource container is the maximum of the earliest start times of all
resources comprised by the resource container that are requested by the job.
lList *requested - list of attribute requests
lList *load_attr - list of load attributes or null on queue level
lList *config_attr - list of user defined attributes
lList *actual_attr - usage of all consumables (RUE_Type)
lList *centry_list - system wide attribute config. list (CE_Type)
lListElem *queue - current queue or NULL on global/host level
bool allow_non_requestable - allow non-requestable attributes?
char *reason - error message
int reason_size - max error message size
int slots - number of slots the job is looking for
u_long32 layer - current layer flag
double lc_factor - load correction factor
u_long32 tag - current layer tag
u_long32 *start_time - in/out argument for start time
u_long32 duration - jobs estimated total run time
const char *object_name - name of the object used for monitoring purposes
dispatch_t -
MT-NOTES: is not thread safe; uses a static buffer
Important:
there is special behavior when slots is set to -1.
ri_slots_by_time() -- Determine number of slots avail. within time frame
static dispatch_t ri_slots_by_time(const sge_assignment_t *a, int *slots,
int *slots_qend, lList *rue_list, lListElem *request, lList *load_attr,
lList *total_list, lListElem *queue, u_long32 layer, double lc_factor,
dstring *reason, bool allow_non_requestable, bool no_centry, const char
*object_name)
The number of slots available with a resource can be zero for static
resources or is determined based on maximum utilization within the
specific time frame, the total amount of the resource and the per
task request of the parallel job (ri_slots_by_time())
const sge_assignment_t *a - ???
int *slots - Returns maximum slots that can be served
within the specified time frame.
int *slots_qend - Returns the maximum possible number of slots
lList *rue_list - Resource utilization (RUE_Type)
lListElem *request - Job request (CE_Type)
lList *load_attr - Load information for the resource
lList *total_list - Total resource amount (CE_Type)
lListElem *queue - Queue instance (QU_Type) for queue-based resources
u_long32 layer - DOMINANT_LAYER_{GLOBAL|HOST|QUEUE}
double lc_factor - load correction factor
dstring *reason - diagnosis information if no rsrc available
bool allow_non_requestable - ???
bool no_centry - ???
const char *object_name - ???
static dispatch_t -
MT-NOTE: ri_slots_by_time() is not MT safe
ri_time_by_slots() -- Determine availability time through slot number
int ri_time_by_slots(lListElem *rep, lList *load_attr, lList
*config_attr, lList *actual_attr, lList *centry_list, lListElem *queue,
char *reason, int reason_size, bool allow_non_requestable, int slots,
u_long32 layer, double lc_factor)
Checks, for one level, whether a request is fulfilled or not.
With reservation scheduling the earliest start time due to
availability of the resource instance is determined by ensuring
non-consumable resource requests are fulfilled or by finding the
earliest time utilization of a consumable resource is below the
threshold required for the request.
sge_assignment_t *a - assignment object that holds job specific scheduling relevant data
lListElem *rep - requested attribute
lList *load_attr - list of load attributes or null on queue level
lList *config_attr - list of user defined attributes (CE_Type)
lList *actual_attr - usage of user consumables (RUE_Type)
lListElem *queue - the current queue, or null on host level
dstring *reason - target for error message
bool allow_non_requestable - allow non-requestable attributes?
int slots - the number of slots the job is looking for
u_long32 layer - the current layer
double lc_factor - load correction factor
u_long32 *start_time - in/out argument for start time
const char *object_name - name of the object used for monitoring purposes
dispatch_t -
sequential_tag_queues_suitable4job() -- ???
The start time of a queue is always returned using the QU_available_at
field.
The overall behaviour of this function is somewhat dependent on the
value that gets passed to assignment->start and whether soft requests
were specified with the job:
(1) In case of now assignments (DISPATCH_TIME_NOW), for jobs without soft
requests only the first suitable queue is tagged. When soft requests
are specified, all queues must be verified and tagged in order to find
the queue that fits best.
(2) In case of reservation assignments (DISPATCH_TIME_QUEUE_END) the earliest
time is searched when the resources of global/host/queue are sufficient
for the job. The time-wise iteration is then done for each single resource
instance.
Actually there are cases when iterating through all queues is not
needed: (a) if there is a global limitation, the search could stop once
a queue is found that causes no further delay; (b) if the job has
soft requests, the search could stop once a queue with the minimum (= 0)
number of soft violations is found.
sge_assignment_t *assignment - job info structure
dispatch_t - 0 ok got an assignment
start time(s) and slots are tagged
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: sequential_tag_queues_suitable4job() is not MT safe
sge_call_pe_qsort() -- call the Parallel Environment qsort plug-in
void sge_call_pe_qsort(sge_assignment_t *a, const char *qsort_args)
sge_assignment_t *a - PE assignment
qsort_args - the PE qsort_args attribute
MT-NOTE: sge_call_pe_qsort() is not MT safe
sge_create_load_list() -- create the control structure for consumables as
load thresholds
void sge_create_load_list(const lList *queue_list, const lList
*host_list, const lList *centry_list, lList **load_list)
Scans all queues for consumables used as load thresholds. It builds a
consumable category for each queue that uses consumables as a load
threshold.
If no consumables are used, the *load_list is set to NULL.
const lList *queue_list - a list of queue instances
const lList *host_list - a list of hosts
const lList *centry_list - a list of complex entries
lList **load_list - a ref to the target load list
MT-NOTE: sge_create_load_list() is MT safe
sge_free_load_list() -- frees the load list and sets it to NULL
void sge_free_load_list(lList **load_list)
lList **load_list - the load list
MT-NOTE: sge_free_load_list() is MT safe
sge_host_match_static() -- Static test whether job fits to host
static int sge_host_match_static(lListElem *job, lListElem *ja_task,
lListElem *host, lList *centry_list, lList *acl_list)
lListElem *job - ???
lListElem *ja_task - ???
lListElem *host - ???
lList *centry_list - ???
lList *acl_list - ???
int - 0 ok
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
sge_load_list_alarm() -- checks if queues went into an alarm state
bool sge_load_list_alarm(lList *load_list, const lList *host_list, const
lList *centry_list)
The function uses the cull bitfield to identify modifications in one of
the consumable elements. If the consumption has changed, the load for all
queues referencing the consumable is recomputed. If a queue exceeds its
load threshold, QU_tagged4schedule is set to 1.
lList *load_list - ???
const lList *host_list - ???
const lList *centry_list - ???
bool - true, if at least one queue was set into alarm state
MT-NOTE: sge_load_list_alarm() is MT safe
sge_queue_match_static() -- Do matching that does not depend on time.
static int sge_queue_match_static(lListElem *queue, lListElem *job,
const lListElem *pe, const lListElem *ckpt, lList *centry_list, lList
*host_list, lList *acl_list)
Checks if a job fits on a queue or not. All checks that depend on the
current load and resource situation must be handled outside.
The queue also gets tagged in QU_tagged4schedule to indicate whether it
is specified using -masterq queue_list.
lListElem *queue - The queue we're matching
lListElem *job - The job
const lListElem *pe - The PE object
const lListElem *ckpt - The ckpt object
lList *centry_list - The centry list
lList *acl_list - The ACL list
dispatch_t - DISPATCH_OK, ok
DISPATCH_NEVER_CAT, assignment will never be possible for all jobs of that category
sge_remove_queue_from_load_list() -- removes queues from the load list
void sge_remove_queue_from_load_list(lList **load_list, const lList
*queue_list)
lList **load_list - load list structure
const lList *queue_list - queues to be removed from it.
MT-NOTE: sge_remove_queue_from_load_list() is MT safe
sge_select_queue() -- checks whether a job matches a given queue or host
int sge_select_queue(lList *requested_attr, lListElem *queue, lListElem
*host, lList *exechost_list, lList *centry_list, bool
allow_non_requestable, int slots)
Takes the requested attributes of a job and checks whether they match the
given host or queue. One and only one of them should be specified. If both
are given, the function assumes that the queue belongs to the given host.
lList *requested_attr - list of requested attributes
lListElem *queue - current queue or null if host is set
lListElem *host - current host or null if queue is set
lList *exechost_list - list of all hosts in the system
lList *centry_list - system wide attribute config list
bool allow_non_requestable - allow non-requestable attributes?
int slots - number of requested slots
lList *queue_user_list - list of users or null
lList *acl_list - acl_list or null
lListElem *job - job or null
int - 1, if okay, QU_tag will be set if a queue is selected
0, if not okay
The caller is responsible for cleaning tags.
No range is used. For serial jobs we will need one call for hard and one
for soft requests. For parallel jobs we will call this function for each
-l request, because in serial jobs requests can simply be added, while in
parallel jobs each -l request may select a different set of queues.
sge_sequential_assignment() -- Make an assignment for a sequential job.
int sge_sequential_assignment(sge_assignment_t *assignment)
For sequential job assignments the earliest job start time
is determined for each queue instance and the earliest one gets
chosen. The secondary criterion for queue selection is minimizing the
job's soft request violations.
The overall behaviour of this function is somewhat dependent on the
value that gets passed to assignment->start and whether soft requests
were specified with the job:
(1) In case of now assignments (DISPATCH_TIME_NOW), for jobs without soft
requests only the first suitable queue is tagged. When soft requests
are specified, all queues must be verified and tagged in order to find
the queue that fits best. On success the start time is set.
(2) In case of queue end assignments (DISPATCH_TIME_QUEUE_END)
sge_assignment_t *assignment - ???
int - 0 ok got an assignment + time (DISPATCH_TIME_NOW and DISPATCH_TIME_QUEUE_END)
1 no assignment at the specified time
-1 assignment will never be possible for all jobs of that category
-2 assignment will never be possible for that particular job
MT-NOTE: sge_sequential_assignment() is not MT safe
sge_split_queue_slots_free() -- ???
int sge_split_queue_slots_free(lList **free, lList **full)
Split the queue list into queues with at least one free slot and queues with
less than one free slot. The list optionally returned in full gets the
QNOSLOTS queue instance state set.
lList **free - Input queue instance list and return free slots.
lList **full - If non-NULL the full queue instances get returned here.
int - 0 success
-1 error
print_hdr() -- print a header for the sharetree dump
void
print_hdr(dstring *out, const format_t *format)
Prints a header for data output using the sge_sharetree_print function.
dstring *out - dstring into which data will be written
const format_t *format - format description
MT-NOTE: print_hdr() is MT-safe
sge_sharetree_print() -- dump sharetree information to a dstring
void sge_sharetree_print(dstring *out, lList *sharetree, lList *users,
lList *projects, lList *config,
bool group_nodes, bool decay_usage,
const char **names, const format_t *format)
Dumps information about a sharetree into a given dstring. Information
is appended.
Outputs information like times, node (user/project) names, configured
shares, actually received shares, targeted shares, usage information
like cpu, memory and io.
It is possible to restrict the number of fields that are output.
Header information and formatting can be configured.
dstring *out - dstring into which data will be written
lList *sharetree - the sharetree to dump
lList *users - the user list
lList *projects - the project list
lList *config - the scheduler configuration list
bool group_nodes - ???
bool decay_usage - ???
const char **names - fields to output
const format_t *format - format description
MT-NOTE: sge_sharetree_print() is MT-safe
sge_do_urgency() -- Compute normalized urgency
void sge_do_urgency(u_long32 now, lList *running_jobs, lList
*pending_jobs, sge_Sdescr_t *lists)
Determine normalized urgency for all job lists passed:
* for the pending jobs we need it to determine the dispatch order
* for the running jobs it is needed when the running jobs' priority must
be compared with pending jobs' (preemption only)
u_long32 now - Current time
lList *running_jobs - The running jobs list
lList *pending_jobs - The pending jobs list
sge_Sdescr_t *lists - Additional config information
sge_normalize_urgency() -- Computes normalized urgency for job list
static void sge_normalize_urgency(lList *job_list, double
min_urgency, double max_urgency)
The normalized urgency is determined for a list of jobs based on the
min/max urgency values passed and the JB_urg value of each job.
lList *job_list - The job list
double min_urgency - minimum urgency value
double max_urgency - maximum urgency value
MT-NOTES: sge_normalize_urgency() is MT safe
sge_normalize_value() -- Returns normalized value with passed value range
double sge_normalize_value(double value, double range_min, double
range_max)
The value passed is normalized and the resulting value (0.0-1.0) is
returned, based on the value range passed. If there is no range because
min/max are (nearly) equal, 0.5 is returned.
double value - Value to be normalized.
double range_min - Range minimum value.
double range_max - Range maximum value.
double - Normalized value (0.0-1.0)
MT-NOTE: sge_normalize_value() is MT safe
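A sketch of the normalization under the documented contract (the clamping of
out-of-range values is an assumption, not taken from the source):

    /* Sketch: map value linearly into [0.0, 1.0] over [range_min, range_max];
       a (nearly) empty range yields 0.5 as documented. */
    double normalize_value_sketch(double value, double range_min,
                                  double range_max)
    {
       double range = range_max - range_min;
       double n;
       if (range < 0.000001 && range > -0.000001)
          return 0.5;               /* min/max (nearly) equal */
       n = (value - range_min) / range;
       if (n < 0.0) n = 0.0;        /* clamping is assumed here */
       if (n > 1.0) n = 1.0;
       return n;
    }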
sge_urgency() -- Determine urgency value for a list of jobs
static void sge_urgency(u_long32 now, double *min_urgency,
double *max_urgency, lList *job_list, const lList *centry_list,
const lList *pe_list)
The urgency value is determined for all jobs in job_list. The urgency
value has two time dependent components (waiting time contribution and
deadline contribution) and a resource request dependent component. Only
resource requests that apply to the job irrespective of what resources it
finally gets assigned are considered. Default requests specified for
consumable resources are not considered, as they are placement dependent.
For the same reason soft requests do not contribute to the urgency value.
The urgency value range is tracked via min/max urgency. Category-based
caching is used for the resource request urgency contribution.
u_long32 now - Current time
double *min_urgency - For tracking minimum urgency value
double *max_urgency - For tracking maximum urgency value
lList *job_list - The jobs.
const lList *centry_list - Needed for per resource urgency setting.
const lList *pe_list - Needed to determine urgency slot setting.
build_functional_categories() -- sorts the pending jobs into functional categories
void build_functional_categories(sge_ref_t *job_ref, int num_jobs,
sge_fcategory_t **root, int dependent)
Generates a list of functional categories. Each category contains a list of jobs
which belong to it. A functional category is assembled from:
- job shares
- user shares
- department shares
- project shares
All jobs with the same job, user, ... shares are put in the same fcategory.
sge_ref_t *job_ref - array of pointers to the job reference structure
int num_jobs - number of elements in the job_ref array
sge_fcategory_t **root - root pointer to the functional category list
sge_ref_list_t **ref_array - has to be a pointer to a NULL pointer. The memory
will be allocated in this function and is freed
with free_fcategories.
int dependent - do the functional tickets depend on previously computed tickets?
u_long32 job_tickets - job field which holds the tickets (JB_jobshare, JB_override_tickets)
u_long32 up_tickets - source for the user/department tickets/shares (UP_fshare, UP_otickets)
u_long32 dp_tickets - source for the department tickets/shares (US_fshare, US_oticket)
u_long32 - number of jobs in the categories
- job classes are ignored.
IMPROVEMENTS:
- the stored values in the functional category structure can be used to speed up the
ticket calculation. This will avoid unnecessary CULL accesses in the function
calc_job_functional_tickets_pass1
- A further improvement can be made by:
- limiting the job list length in each category to the max number of jobs calculated
- sorting the jobs in each functional category by their job category. Each resulting
job list can be at most as long as the number of open slots. This will result in a
correct ftix result for all jobs that might be scheduled.
???
calc_intern_pending_job_functional_tickets() -- calc ftix for pending jobs
void calc_intern_pending_job_functional_tickets(sge_fcategory_t *current,
double sum_of_user_functional_shares,
double sum_of_project_functional_shares,
double sum_of_department_functional_shares,
double sum_of_job_functional_shares,
double total_functional_tickets,
double weight[])
This is an optimized and incomplete version of calc_pending_job_functional_tickets.
It is good enough to get the order right within the inner loop of the ftix
calculation.
sge_fcategory_t *current - current fcategory
double sum_of_user_functional_shares
double sum_of_project_functional_shares
double sum_of_department_functional_shares
double sum_of_job_functional_shares
double total_functional_tickets
double weight[] - distribution of the shares relative to each other
be careful using it
???
calculate_pending_shared_override_tickets() -- calculate shared override tickets
static void calculate_pending_shared_override_tickets(sge_ref_t *job_ref,
int num_jobs, int dependent)
We calculate the override tickets for pending jobs, which are shared. The basic
algorithm looks like this:
do for each pending job
do for each pending job which isn't yet considered active
consider the job active
calculate override tickets for that job
consider the job not active
end do
consider the job with the highest priority (taking into account all previous policies + override tickets) as active
end do
set all pending jobs not active
Since this algorithm is very expensive, we split all pending jobs into fcategories. The algorithm changes to:
max_jobs = build fcategories and ignore jobs, which would get 0 override tickets
do for max_jobs pending job
do for each fcategory
take the first job from the category
consider the job active
calculate override tickets for that job
consider the job not active
store job with the most override tickets = job_max
end do
set job_max active and remove it from its fcategory.
remove job_max's fcategory if job_max was the last job in it
end;
set all pending jobs not active
That's it. It is very similar to the functional ticket calculation, except that we are
working with tickets and not with shares (see the sketch after this entry).
sge_ref_t *job_ref - an array of job structures (first running, then pending)
int num_jobs - number of jobs in the array
int dependent - do other ticket policies depend on this one?
MT-NOTE: calculate_pending_shared_override_tickets() is MT safe
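A C-level sketch of the fcategory variant described above (fcat_t, job_t and
all helpers are made-up stand-ins):

    /* Sketch: in each round, probe only the head job of every fcategory,
       activate the winner and remove it from its category. */
    int i;
    for (i = 0; i < max_jobs; i++) {
       job_t *job_max = NULL;
       double best = -1.0;
       fcat_t *fc;
       for (fc = fcategories; fc != NULL; fc = fc->next) {
          double otix;
          job_t *job = fc->first_job;
          set_active(job, true);
          otix = calc_override_tickets(job);  /* probe this candidate */
          set_active(job, false);
          if (otix > best) {
             best = otix;
             job_max = job;
          }
       }
       set_active(job_max, true);         /* winner stays active */
       remove_from_fcategory(job_max);    /* empty categories are dropped */
    }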
copy_ftickets() -- copy the ftix from one job to another
void copy_ftickets(sge_ref_list_t *source, sge_ref_list_t *dest)
Copies the functional tickets and the ref fields used for ftix calculation
from one job to another job.
sge_ref_list_t *source - source job
sge_ref_list_t *dest - dest job
???
destribute_ftickets() -- ensures that all jobs have ftix associated with them.
void destribute_ftickets(sge_fcategory_t *root, int dependent)
After the functional tickets are calculated, only the first job in the fcategory
job list has ftix. This function copies the result from the first job to all
other jobs in the same list and sums the job ticket count with the ftix.
sge_fcategory_t *root - fcategory list
int dependent - does the final ticket count depend on ftix?
- This function is only needed because not all functional tickets are calculated;
to give a best-guess result, all jobs in one category with no ftix get the
same amount of ftix.
free_fcategories() -- frees all fcategories and their job lists.
void free_fcategories(sge_fcategory_t **fcategories)
frees all fcategories and their job lists.
sge_fcategory_t **fcategories - pointer to a pointer to the first fcategory
sge_ref_list_t **ref_array - memory for internal structures, allocated with
build_functional_categories. Needs to be freed as well.
- it does not delete the sge_ref_t structures which are stored
in the job lists.
recompute_prio() -- Recompute JAT prio based on changed ticket amount
static void recompute_prio(sge_task_ref_t *tref, lListElem *task, double
nurg)
Each time the ticket amount in a JAT_Type element is changed,
the JAT_prio needs to be updated. The new ticket value is normalized
and the priority value is computed.
sge_task_ref_t *tref - The tref element that is related to the ticket change
lListElem *task - The JAT_Type task element.
double nurg - The normalized urgency assumed for the job.
double npri - The normalized POSIX priority assumed for the job.
sge_build_sgeee_orders() -- build orders for updating qmaster
void sge_build_sgeee_orders(sge_Sdescr_t *lists, lList *running_jobs,
lList *queued_jobs, lList *finished_jobs, order_t *orders, int
update_usage_and_configuration, int seqno)
Generates the order list for sending the scheduling decisions
to the qmaster. The following orders are generated:
- running job tickets
- pending job tickets
- delete order for finished jobs
- update user usage order
- update project usage order
- update share tree order
- update scheduler configuration order
- orders updating user/project resource usage (ORT_update_project_usage)
- orders updating running tickets needed for dynamic repriorization (ORT_ticket)
Most orders are generated by using the sge_create_orders function.
sge_Sdescr_t *lists - ???
lList *running_jobs - list of running jobs
lList *queued_jobs - list of queued jobs (should be sorted by tickets)
lList *finished_jobs - list of finished jobs
order_t *orders - existing order list (new orders will be added to it)
bool update_usage_and_configuration - if true, the update usage orders are generated
int seqno - a seqno, changed with each scheduling run
bool max_queued_ticket_orders - if true, pending tickets are submitted to the
qmaster
bool updated_execd - if true, the queue information is sent with
the running job tickets
void
sge_do_sgeee_priority() -- determine GEEE priority for a list of jobs
static void sge_do_sgeee_priority(lList *job_list, double min_tix, double
max_tix)
Determines the GEEE priority for a list of jobs. Before
sge_do_sgeee_priority() can be called, the normalized urgency value must
already be known for each job. The ticket range passed is used for
normalizing the ticket amount.
lList *job_list - The job list
double min_tix - Minimum ticket amount
double max_tix - Maximum ticket amount
bool do_nprio - whether the normalized priority must be determined
bool do_nurg - whether the normalized urgency must be determined
MT-NOTE: sge_do_sgeee_priority() is MT safe
sgeee_priority() -- Compute final GE priority
static void sgeee_priority(lListElem *task, u_long32 jobid, double nsu,
double min_tix, double max_tix)
The GE priority is computed for the task based on the already known
ticket amount and already normalized urgency value. The ticket amount
is normalized based on the ticket range passed. The weights for
ticket and urgency value are applied.
lListElem *task - The task whose priority is computed
u_long32 jobid - The job's id
double nsu - The normalized urgency value that applies to all
tasks of the job.
double min_tix - minimum ticket amount
double max_tix - maximum ticket amount
MT-NOTE: sgeee_priority() is MT safe
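Taken together with recompute_prio(), the final priority is a weighted sum of
the three normalized inputs; a sketch (the weight_* names follow the scheduler
configuration parameters and are an assumption, not quoted from the source):

    /* Sketch: combine normalized tickets (ntix), urgency (nurg) and
       POSIX priority (npri) into the final GEEE priority. */
    double prio = weight_ticket   * ntix
                + weight_urgency  * nurg
                + weight_priority * npri;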
sgeee_resort_pending_jobs() -- Resort pending jobs after assignment
void sgeee_resort_pending_jobs(lList **job_list, lList *orderlist)
Updates the pending jobs' order upon assignment and changes ticket amounts
in orders previously created.
If we dispatch a job sub-task and the job has more sub-tasks, then
the job is still first in the job list.
We need to remove and reinsert the job into the sorted job
list in case another job has a higher priority (i.e. has more tickets).
Additionally it is necessary to update the number of pending tickets
for the following pending array task. (The next task will get fewer
tickets than the current one.)
lList **job_list - The pending job list. The first job in the list was
assigned right before.
sgeee_scheduler() -- calc tickets, send orders, and sort job list
int sgeee_scheduler(sge_Sdescr_t *lists, lList *running_jobs, lList
*finished_jobs, lList *pending_jobs, lList **orderlist)
- calculates the running and pending job tickets
- sends the orders about the job tickets to the qmaster
- orders the pending job list according to the job tickets
On a "normal" scheduling interval:
- calculate tickets for new and running jobs
- don't decay and sum usage
- don't update qmaster
On a scheduling interval:
- calculate tickets for new and running jobs
- decay and sum usage
- handle finished jobs
- update qmaster
sge_Sdescr_t *lists - a ref to all lists in this scheduler
lList *running_jobs - a list of all running jobs
lList *finished_jobs - a list of all finished jobs
lList *pending_jobs - a list of all pending jobs
lList **orderlist - the order list
int - 0 if everything went fine, -1 if not
tix_range_get() -- Get stored ticket range.
static void tix_range_get(double *min_tix, double *max_tix)
Get stored ticket range from global variables.
double *min_tix - Target for minimum value.
double *max_tix - Target for maximum value.
MT-NOTES: tix_range_get() is not MT safe
tix_range_set() -- Store ticket range.
static void tix_range_set(double min_tix, double max_tix)
Stores ticket range in the global variables.
double min_tix - Minimum ticket value.
double max_tix - Maximum ticket value.
MT-NOTES: tix_range_set() is not MT safe
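The MT-NOTEs above follow directly from the storage; a sketch of the
unsynchronized globals behind the two accessors:

    /* Sketch: the range lives in plain globals without any lock,
       which is why neither accessor is MT safe. */
    static double Tix_min = 0.0;
    static double Tix_max = 0.0;

    static void tix_range_set_sketch(double min_tix, double max_tix)
    {
       Tix_min = min_tix;
       Tix_max = max_tix;
    }

    static void tix_range_get_sketch(double *min_tix, double *max_tix)
    {
       *min_tix = Tix_min;
       *max_tix = Tix_max;
    }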
sge_ar_queue_have_users_access() -- verify that all users of an AR have queue
access
bool sge_ar_queue_have_users_access(lList **alpp, lListElem *ar, lListElem
*queue, lList *master_userset_list)
Iterates over the AR_acl_list and verifies that every entry has queue access.
If even one entry has no access the function returns false.
lList **alpp - answer list
lListElem *ar - advance reservation object (AR_Type)
lListElem *queue - queue instance object (QU_Type)
lList *master_userset_list - master userset list
bool - true if all have access
false if at least one has no access
MT-NOTE: sge_ar_queue_have_users_access() is MT safe
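The check reduces to one loop over the ACL entries; a sketch with
hypothetical first()/next()/has_queue_access() stand-ins for the CULL
iteration and the access test:

    /* Sketch: every AR ACL entry must have queue access; a single entry
       without access rejects the whole advance reservation. */
    lListElem *entry;
    for (entry = first(ar_acl_list); entry != NULL; entry = next(entry)) {
       if (!has_queue_access(entry, queue, master_userset_list))
          return false;
    }
    return true;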