Jobs on hold while trying to submit madgraph jobs to condor cluster

Asked by William

Hi,

I have this issue while trying to submit a job to the HTCondor cluster:

-- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?... @ 02/19/19 14:55:46
OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS
wkorcari CMD: survey.sh 2/19 12:53 _ _ _ 5 5 127.0 ... 183.0

5 jobs; 0 completed, 0 removed, 0 idle, 0 running, 5 held, 0 suspended
-bash-4.2$ condor_q -analyze

-- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?...

127.000: Job is held.

Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_4223/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32585>

172.000: Job is held.

Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_6967/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32089>

HERE is my mg5_configuration.txt settings for cluster:

#! Default Running mode
#! 0: single machine/ 1: cluster / 2: multicore
run_mode = 1

#! Cluster Type [pbs|sge|condor|lsf|ge|slurm|htcaas|htcaas2] Use for cluster run only
#! And cluster queue (or partition for slurm)
#! And size of the cluster (some part of the code can adapt splitting accordingly)
cluster_type = condor
#cluster_queue = madgraph
cluster_size = 9

#! Path to a node directory to avoid direct writing on the central disk
#! Note that condor clusters avoid direct writing by default (therefore this
#! options does not affect condor clusters)
# cluster_temp_path = None

#! path to a node directory where local file can be found (typically pdf)
#! to avoid to send them to the node (if cluster_temp_path is on True or condor)
#cluster_local_path = None # example: /cvmfs/cp3.uclouvain.be/madgraph/

#! Cluster waiting time for status update
#! First number is when the number of waiting job is higher than the number
#! of running one (time in second). The second number is in the second case.
cluster_status_update = 60 30

#! How to deal with failed submission (can occurs on cluster mode)
#! 0: crash, -1: print error, hangs the program up to manual instructions, N(>0) retry up to N times.
cluster_nb_retry = 1

#! How much time to wait for the output file before resubmission/crash (filesystem can be very slow)
cluster_retry_wait = 600

#! Nb_core to use (None = all) This is use only for multicore run
#! This correspond also to the number core used for code compilation for cluster mode
nb_core = 1

Here is the .log file for the run:

generate_events run_01
Traceback (most recent call last):
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1514, in onecmd
    return self.onecmd_orig(line, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
    return func(arg, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2469, in do_generate_events
    self.run_generate_events(switch_mode, args)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/common_run_interface.py", line 6884, in new_fct
    original_fct(obj, *args, **opts)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2508, in run_generate_events
    postcmd=False)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1543, in exec_cmd
    stop = Cmd.onecmd_orig(current_interface, line, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
    return func(arg, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 3322, in do_survey
    self.monitor(run_type='All jobs submitted for survey', html=True)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
    return self.onecmd_orig(line, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
    return func(arg, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2469, in do_generate_events
    self.run_generate_events(switch_mode, args)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/common_run_interface.py", line 6884, in new_fct
    original_fct(obj, *args, **opts)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2508, in run_generate_events
    postcmd=False)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1543, in exec_cmd
    stop = Cmd.onecmd_orig(current_interface, line, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
    return func(arg, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 3322, in do_survey
    self.monitor(run_type='All jobs submitted for survey', html=True)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
    return self.monitor(run_type=run_type, mode=mode, html=html)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5490, in monitor
    self.cluster.wait(self.me_dir, update_status, update_first=update_first)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 52, in deco_f_interupt
    return f(self, *args, **opt)
  File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 313, in wait
    raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
ClusterManagmentError: Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team

I hope that somebody can help me figure this out, thank you.

Cheers,

William.

Question information

Language:
English Edit question
Status:
Answered
For:
MadGraph5_aMC@NLO Edit question
Assignee:
No assignee Edit question
Last query:
Last reply:
Revision history for this message
Olivier Mattelaer (olivier-mattelaer) said :
#1

Hi,

We do not have any local condor where we can test anymore.
But unless you have a very recent version of HTCondor which drops some options, it should work in principle.

This seems to be related to the transfer of files at the start of the job
(done to avoid writing directly on the NFS file system).
Can you contact your sys-admin and ask whether such file transfers are supposed to work on your machine?

If needed, you could experiment with the submission settings (for example, remove that option and write directly to the home/central disk). The file controlling the cluster submission is
madgraph/various/cluster.py

Cheers,

Olivier

> On 19 Feb 2019, at 15:09, William <email address hidden> wrote:
>
> New question #678661 on MadGraph5_aMC@NLO:
> https://answers.launchpad.net/mg5amcnlo/+question/678661
>
> Hi,
>
> I have this issue while trying to submit a job in the HTCondor cluster:
>
> -- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?... @ 02/19/19 14:55:46
> OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS
> wkorcari CMD: survey.sh 2/19 12:53 _ _ _ 5 5 127.0 ... 183.0
>
> 5 jobs; 0 completed, 0 removed, 0 idle, 0 running, 5 held, 0 suspended
> -bash-4.2$ condor_q -analyze
>
>
> -- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?...
>
> 127.000: Job is held.
>
> Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_4223/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32585>
>
>
> 172.000: Job is held.
>
> Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_6967/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32089>
>
>
> HERE is my mg5_configuration.txt settings for cluster:
>
> #! Default Running mode
> #! 0: single machine/ 1: cluster / 2: multicore
> run_mode = 1
>
> #! Cluster Type [pbs|sge|condor|lsf|ge|slurm|htcaas|htcaas2] Use for cluster run only
> #! And cluster queue (or partition for slurm)
> #! And size of the cluster (some part of the code can adapt splitting accordingly)
> cluster_type = condor
> #cluster_queue = madgraph
> cluster_size = 9
>
> #! Path to a node directory to avoid direct writing on the central disk
> #! Note that condor clusters avoid direct writing by default (therefore this
> #! options does not affect condor clusters)
> # cluster_temp_path = None
>
> #! path to a node directory where local file can be found (typically pdf)
> #! to avoid to send them to the node (if cluster_temp_path is on True or condor)
> #cluster_local_path = None # example: /cvmfs/cp3.uclouvain.be/madgraph/
>
> #! Cluster waiting time for status update
> #! First number is when the number of waiting job is higher than the number
> #! of running one (time in second). The second number is in the second case.
> cluster_status_update = 60 30
>
> #! How to deal with failed submission (can occurs on cluster mode)
> #! 0: crash, -1: print error, hangs the program up to manual instructions, N(>0) retry up to N times.
> cluster_nb_retry = 1
>
> #! How much time to wait for the output file before resubmission/crash (filesystem can be very slow)
> cluster_retry_wait = 600
>
> #! Nb_core to use (None = all) This is use only for multicore run
> #! This correspond also to the number core used for code compilation for cluster mode
> nb_core = 1
>
>
> Here is the .log file for the run:
>
> generate_events run_01
> Traceback (most recent call last):
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1514, in onecmd
> return self.onecmd_orig(line, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
> return func(arg, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2469, in do_generate_events
> self.run_generate_events(switch_mode, args)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/common_run_interface.py", line 6884, in new_fct
> original_fct(obj, *args, **opts)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2508, in run_generate_events
> postcmd=False)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1543, in exec_cmd
> stop = Cmd.onecmd_orig(current_interface, line, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
> return func(arg, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 3322, in do_survey
> self.monitor(run_type='All jobs submitted for survey', html=True)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> return self.onecmd_orig(line, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
> return func(arg, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2469, in do_generate_events
> self.run_generate_events(switch_mode, args)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/common_run_interface.py", line 6884, in new_fct
> original_fct(obj, *args, **opts)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2508, in run_generate_events
> postcmd=False)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1543, in exec_cmd
> stop = Cmd.onecmd_orig(current_interface, line, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
> return func(arg, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 3322, in do_survey
> self.monitor(run_type='All jobs submitted for survey', html=True)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
> return self.monitor(run_type=run_type, mode=mode, html=html)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5490, in monitor
> self.cluster.wait(self.me_dir, update_status, update_first=update_first)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 52, in deco_f_interupt
> return f(self, *args, **opt)
> File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 313, in wait
> raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
> ClusterManagmentError: Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team
>
>
> I hope that somebody can help me figuring this out, thank you.
>
> Cheers,
>
> William.
>
>
> You received this question notification because you are an answer
> contact for MadGraph5_aMC@NLO.

Revision history for this message
Max Fox (maxfo) said (last edit ):
#2

local condor?

Can you help with this problem?

Provide an answer of your own, or ask William for more information if necessary.

To post a message you must log in.