[Starcluster] cannot start cluster with m2.4xlarge
Justin Riley
jtriley at MIT.EDU
Sat Apr 24 21:59:10 EDT 2010
Hi Damian,
My bad, there was a typo in the last commit (`ignore_exist_status` instead of `ignore_exit_status` in ssh.py). It should be fixed on GitHub now.
~Justin
Quoting Damian Eads <eads at soe.ucsc.edu>:
> Hi,
>
> I've tried starting a three-node cluster with the biggest instance
> type (m2.4xlarge), and it crashes at the point shown below. This is with
> Justin's latest git checkout. Any ideas?
>
> Thanks,
>
> Damian
>
>
> eads at argentina:~/work/repo/StarCluster$ starcluster start -x mycluster dtest
> StarCluster - (http://web.mit.edu/starcluster)
> Software Tools for Academics and Researchers (STAR)
> Please submit bug reports to starcluster at mit.edu
>
>>>> Validating cluster settings...
>>>> Cluster settings are valid
>>>> Starting cluster...
>>>> Waiting for cluster to start...
>>>> The master node is ec2-174-129-138-92.compute-1.amazonaws.com
>>>> Attaching volume vol-1dbc0c74 to master node...
>>>> Setting up the cluster...
>>>> Mounting EBS volume vol-1dbc0c74 on /data...
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>> Creating cluster user: sgeadmin
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>> Configuring scratch space for user: sgeadmin
>>>> Configuring /etc/hosts on each node
>>>> Configuring NFS...
> ERROR: An unexpected error occurred while tokenizing input
> The following traceback may be corrupted or invalid
> The error message is: ('EOF in multi-line statement', (405, 0))
>
> ---------------------------------------------------------------------------
> NameError Traceback (most recent call last)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
> in <module>()
> 3 __requires__ = 'StarCluster==0.9999'
> 4 import pkg_resources
> ----> 5 pkg_resources.run_script('StarCluster==0.9999', 'starcluster')
> 6
> 7
>
> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
> requires, script_name)
> 446 ns.clear()
> 447 ns['__name__'] = name
> --> 448 self.require(requires)[0].run_script(script_name, ns)
> 449
> 450
>
> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
> script_name, namespace)
> 1171 )
> 1172 script_code = compile(script_text,script_filename,'exec')
> -> 1173 exec script_code in namespace, namespace
> 1174
> 1175 def _has(self, path):
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
> in <module>()
> 4
> 5
> ----> 6
> 7
> 8
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
> in main()
> 918 sys.exit(0)
> 919 try:
> --> 920 sc.execute(args)
> 921 except exception.BaseException,e:
> 922 log.error(e.msg)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
> in execute(self, args)
> 179 log.info('Cluster settings are valid')
> 180 if not self.opts.validate_only:
> --> 181 scluster.start(create=not self.opts.no_create)
> 182 if self.opts.login_master:
> 183 cluster.ssh_to_master(tag, self.cfg)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/utils.pyc
> in wrapper(*arg, **kargs)
> 24 """Raw timing function """
> 25 time1 = time.time()
> ---> 26 res = func(*arg, **kargs)
> 27 time2 = time.time()
> 28 log.info('%s took %0.3f mins' % (func.func_name,
> (time2-time1)/60.0))
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cluster.pyc
> in start(self, create)
> 512 self.nodes, self.master_node,
> 513 self.cluster_user, self.cluster_shell,
> --> 514 self.volumes
> 515 )
> 516 self.create_receipt()
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
> in run(self, nodes, master, user, user_shell, volumes)
> 334 self._setup_scratch()
> 335 self._setup_etc_hosts()
> --> 336 self._setup_nfs()
> 337 self._setup_passwordless_ssh()
> 338 self._setup_sge()
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
> in _setup_nfs(self)
> 245 mconn.execute('/etc/init.d/nfs start')
> 246 mconn.execute('/usr/sbin/exportfs -r')
> --> 247 mconn.execute('mount -t devpts none /dev/pts') # fix
> for xterm/mpi printing to stdout
> 248
> 249 # setup /etc/fstab and mount /home and /opt/sge6 on each node
>
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/ssh.pyc
> in execute(self, command, silent, only_printable, ignore_exit_status)
> 221 exit_status = channel.recv_exit_status()
> 222 if exit_status != 0:
> --> 223 if not ignore_exist_status:
> 224 log.error("command %s failed with status %d" %
> (command,
> 225
> exit_status))
>
>
> --
> -----------------------------------------------------
> Damian Eads Ph.D. Candidate
> University of California Computer Science
> 1156 High Street Machine Learning Lab, E2-489
> Santa Cruz, CA 95064 http://www.soe.ucsc.edu/~eads
> _______________________________________________
> Starcluster mailing list
> Starcluster at mit.edu
> http://mailman.mit.edu/mailman/listinfo/starcluster
>
More information about the StarCluster mailing list