[Starcluster] cannot start cluster with m2.4xlarge

Justin Riley jtriley at MIT.EDU
Sat Apr 24 21:59:10 EDT 2010


Hi Damian,

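My bad, that was a typo in the last commit: ssh.py's execute() referenced
"ignore_exist_status" instead of its "ignore_exit_status" parameter, which
is the NameError in your traceback. It should be fixed on GitHub now.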

~Justin

Quoting Damian Eads <eads at soe.ucsc.edu>:

> Hi,
>
> I've tried starting a three node cluster with the biggest instance
> type (m2.4xlarge) and it crashes here. This is with Justin's latest
> git. Ideas?
>
> Thanks,
>
> Damian
>
>
> eads at argentina:~/work/repo/StarCluster$ starcluster start -x mycluster dtest
> StarCluster - (http://web.mit.edu/starcluster)
> Software Tools for Academics and Researchers (STAR)
> Please submit bug reports to starcluster at mit.edu
>
>>>> Validating cluster settings...
>>>> Cluster settings are valid
>>>> Starting cluster...
>>>> Waiting for cluster to start...
>>>> The master node is ec2-174-129-138-92.compute-1.amazonaws.com
>>>> Attaching volume vol-1dbc0c74 to master node...
>>>> Setting up the cluster...
>>>> Mounting EBS volume vol-1dbc0c74 on /data...
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>> Creating cluster user: sgeadmin
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
> ssh.py:66 - WARNING - specified key does not end in either rsa or dsa,
> trying both
>>>> Using private key /home/eads/deadskey.pem (rsa)
>>>> Configuring scratch space for user: sgeadmin
>>>> Configuring /etc/hosts on each node
>>>> Configuring NFS...
> ERROR: An unexpected error occurred while tokenizing input
> The following traceback may be corrupted or invalid
> The error message is: ('EOF in multi-line statement', (405, 0))
>
> ---------------------------------------------------------------------------
> NameError                                 Traceback (most recent call last)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
> in <module>()
>      3 __requires__ = 'StarCluster==0.9999'
>      4 import pkg_resources
> ----> 5 pkg_resources.run_script('StarCluster==0.9999', 'starcluster')
>      6
>      7
>
> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
> requires, script_name)
>    446         ns.clear()
>    447         ns['__name__'] = name
> --> 448         self.require(requires)[0].run_script(script_name, ns)
>    449
>    450
>
> /usr/lib/python2.6/dist-packages/pkg_resources.pyc in run_script(self,
> script_name, namespace)
>   1171             )
>   1172             script_code = compile(script_text,script_filename,'exec')
> -> 1173             exec script_code in namespace, namespace
>   1174
>   1175     def _has(self, path):
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/EGG-INFO/scripts/starcluster
> in <module>()
>      4
>      5
> ----> 6
>      7
>      8
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
> in main()
>    918         sys.exit(0)
>    919     try:
> --> 920         sc.execute(args)
>    921     except exception.BaseException,e:
>    922         log.error(e.msg)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cli.pyc
> in execute(self, args)
>    179             log.info('Cluster settings are valid')
>    180             if not self.opts.validate_only:
> --> 181                 scluster.start(create=not self.opts.no_create)
>    182                 if self.opts.login_master:
>    183                     cluster.ssh_to_master(tag, self.cfg)
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/utils.pyc
> in wrapper(*arg, **kargs)
>     24         """Raw timing function """
>     25         time1 = time.time()
> ---> 26         res = func(*arg, **kargs)
>     27         time2 = time.time()
>     28         log.info('%s took %0.3f mins' % (func.func_name,
> (time2-time1)/60.0))
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/cluster.pyc
> in start(self, create)
>    512             self.nodes, self.master_node,
>    513             self.cluster_user, self.cluster_shell,
> --> 514             self.volumes
>    515         )
>    516         self.create_receipt()
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
> in run(self, nodes, master, user, user_shell, volumes)
>    334         self._setup_scratch()
>    335         self._setup_etc_hosts()
> --> 336         self._setup_nfs()
>    337         self._setup_passwordless_ssh()
>    338         self._setup_sge()
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/clustersetup.pyc
> in _setup_nfs(self)
>    245         mconn.execute('/etc/init.d/nfs start')
>    246         mconn.execute('/usr/sbin/exportfs -r')
> --> 247         mconn.execute('mount -t devpts none /dev/pts') # fix for xterm/mpi printing to stdout
>    248
>    249         # setup /etc/fstab and mount /home and /opt/sge6 on each node
>
>
> /tmp/qqq/lib/python2.6/site-packages/StarCluster-0.9999-py2.6.egg/starcluster/ssh.pyc
> in execute(self, command, silent, only_printable, ignore_exit_status)
>    221         exit_status = channel.recv_exit_status()
>    222         if exit_status != 0:
> --> 223             if not ignore_exist_status:
>    224                 log.error("command %s failed with status %d" % (command,
>    225                     exit_status))
>
>
> --
> -----------------------------------------------------
> Damian Eads                           Ph.D. Candidate
> University of California             Computer Science
> 1156 High Street         Machine Learning Lab, E2-489
> Santa Cruz, CA 95064    http://www.soe.ucsc.edu/~eads
> _______________________________________________
> Starcluster mailing list
> Starcluster at mit.edu
> http://mailman.mit.edu/mailman/listinfo/starcluster
>





More information about the StarCluster mailing list