Hi,
I have the following issue while trying to submit jobs to the HTCondor cluster:
-- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?... @ 02/19/19 14:55:46
OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS
wkorcari CMD: survey.sh 2/19 12:53 _ _ _ 5 5 127.0 ... 183.0
5 jobs; 0 completed, 0 removed, 0 idle, 0 running, 5 held, 0 suspended
-bash-4.2$ condor_q -analyze
-- Schedd: xxxxxxxxxxxxxxxxxxxxxxxx : <131.154.140.11:9618?...
127.000: Job is held.
Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_4223/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32585>
172.000: Job is held.
Hold reason: Error from slot1@xxxxxxxxxxxxxxxxxxxxxxxx: STARTER at 131.154.140.87 failed to send file(s) to <131.154.140.11:9618>: error reading from /var/lib/condor/execute/dir_6967/G93: (errno 2) No such file or directory; SHADOW failed to receive file(s) from <131.154.140.87:32089>
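For completeness, this is how I have been inspecting the held jobs so far (127.0 is one of the cluster IDs from the output above; condor_release is only listed for reference, since I have not found the actual cause yet):

condor_q -held
condor_q -constraint 'JobStatus == 5' -af ClusterId ProcId HoldReason
condor_q -better-analyze 127.0
# once the cause is fixed, the held jobs could in principle be released instead of resubmitted:
condor_release 127.0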
Here are my mg5_configuration.txt settings for the cluster:
#! Default Running mode
#! 0: single machine/ 1: cluster / 2: multicore
run_mode = 1
#! Cluster Type [pbs|sge|condor|lsf|ge|slurm|htcaas|htcaas2] Use for cluster run only
#! And cluster queue (or partition for slurm)
#! And size of the cluster (some part of the code can adapt splitting accordingly)
cluster_type = condor
#cluster_queue = madgraph
cluster_size = 9
#! Path to a node directory to avoid direct writing on the central disk
#! Note that condor clusters avoid direct writing by default (therefore this
#! option does not affect condor clusters)
# cluster_temp_path = None
#! Path to a node directory where local files can be found (typically PDF sets)
#! to avoid sending them to the node (if cluster_temp_path is set to True or on condor)
#cluster_local_path = None # example: /cvmfs/cp3.uclouvain.be/madgraph/
#! Cluster waiting time for status update
#! The first number is used when the number of waiting jobs is higher than the number
#! of running ones (time in seconds); the second number is used otherwise.
cluster_status_update = 60 30
#! How to deal with failed submissions (can occur in cluster mode)
#! 0: crash, -1: print the error and hang the program until manual instructions are given, N(>0): retry up to N times.
cluster_nb_retry = 1
#! How much time to wait for the output file before resubmission/crash (filesystem can be very slow)
cluster_retry_wait = 600
#! Number of cores to use (None = all). This is used only for multicore runs.
#! It also corresponds to the number of cores used for code compilation in cluster mode.
nb_core = 1
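If this turns out to be only the shared filesystem being slow to expose the output files, I suppose I could make the retry settings more tolerant, along these lines (untested values on my part, just to illustrate):

#! retry failed jobs a few more times and wait longer for the output files
cluster_nb_retry = 3
cluster_retry_wait = 1200

However, from the hold reason above it looks more like the G93 output is never produced on the worker node at all, so I am not sure this would help.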
Here is the .log file for the run:
generate_events run_01
Traceback (most recent call last):
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1514, in onecmd
return self.onecmd_orig(line, **opt)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
return func(arg, **opt)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2469, in do_generate_events
self.run_generate_events(switch_mode, args)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/common_run_interface.py", line 6884, in new_fct
original_fct(obj, *args, **opts)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 2508, in run_generate_events
postcmd=False)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1543, in exec_cmd
stop = Cmd.onecmd_orig(current_interface, line, **opt)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/extended_cmd.py", line 1463, in onecmd_orig
return func(arg, **opt)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 3322, in do_survey
self.monitor(run_type='All jobs submitted for survey', html=True)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
return self.monitor(run_type=run_type, mode=mode, html=html)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
return self.monitor(run_type=run_type, mode=mode, html=html)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
return self.monitor(run_type=run_type, mode=mode, html=html)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5501, in monitor
return self.monitor(run_type=run_type, mode=mode, html=html)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/interface/madevent_interface.py", line 5490, in monitor
self.cluster.wait(self.me_dir, update_status, update_first=update_first)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 52, in deco_f_interupt
return f(self, *args, **opt)
File "/home/CMS-T3/wkorcari/madgraph/MG5_aMC_v2_6_5/madgraph/various/cluster.py", line 313, in wait
raise ClusterManagmentError('Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team')
ClusterManagmentError: Some Jobs are in a Hold/... state. Please try to investigate or contact the IT team
I hope that somebody can help me figure this out. Thank you.
Cheers,
William.