Difference between revisions of "Mash"
Line 20: | Line 20: | ||
== De novo assembly == | == De novo assembly == | ||
− | We use SPAdes with the --meta option here as we are | + | We use SPAdes with the --meta option here as we are dealing with metagenomes. First we need the python DRMAA script which will control the where and how the job script will be run: |
− | |||
+ | #!/usr/bin/env python2.7 | ||
+ | import os, sys, drmaa | ||
+ | |||
+ | def main(): | ||
+ | """Submit an array job.""" | ||
+ | argquan=len(sys.argv) | ||
+ | if argquan != 4: | ||
+ | print "This script requires two arguments: 1) the script to run in ja mode 2) filelist of absolute paths and filenames 3) Number of threads/CPU for *each* job array" | ||
+ | sys.exit(2) | ||
+ | |||
+ | s = drmaa.Session() | ||
+ | s.initialize() | ||
+ | print 'Creating job template' | ||
+ | jt = s.createJobTemplate() | ||
+ | jt.workingDirectory=os.getcwd() # means sge job output will be deposited here. | ||
+ | jt.remoteCommand = jt.workingDirectory + '/' +sys.argv[1] | ||
+ | |||
+ | with open(sys.argv[2]) as x: fl = x.read().splitlines() | ||
+ | eflsz=len(fl) | ||
+ | PL=[] | ||
+ | for i in xrange(eflsz): | ||
+ | PL.append(fl[i]) | ||
+ | pld2=len(PL)/2 | ||
+ | jt.args =PL | ||
+ | |||
+ | # prepare natSpec | ||
+ | nm='-N jadrm0' | ||
+ | jt.nativeSpecification='-V -pe multi ' +sys.argv[3]+ ' '+nm | ||
+ | # this is an intensive IO job, don't want to whack the FS too much | ||
+ | jt.joinFiles=True | ||
+ | # | ||
+ | # print eflsz | ||
+ | jobid = s.runBulkJobs(jt, 1, pld2, 1) | ||
+ | print 'Your job has been submitted with id ' + str(jobid) | ||
+ | |||
+ | print 'Cleaning up' | ||
+ | s.deleteJobTemplate(jt) | ||
+ | s.exit() | ||
+ | |||
+ | if __name__=='__main__': | ||
+ | main() | ||
= Links = | = Links = | ||
* [https://mash.readthedocs.io/en/latest main Mash documentation] | * [https://mash.readthedocs.io/en/latest main Mash documentation] |
Revision as of 10:24, 8 March 2017
Contents
Introduction
MinHash is a general dimensionality-reduction technique and it is used by Mash to reduce large sequences and sequence sets to small, representative sketches with the result that global mutation distances (Mash distances) can be rapidly estimated.
Other aspects
- describes itself as an alignment-free method
Usage
Typical analysis
Mash is run on genomes. These will usually be de-novo assembled genomes from tools such as Velvet or SPAdes.
Parallel Usage on gridengine
We'll go through a process here of running Mash on a set of samples, using the DRMAA library to launch Gridengine job arrays.
The scripts take as an argument a file listing the sample names, and it is assumed there are two paired-end FASTQ read files per sample. It is also assumed that the paired-end files appear in order in the file listing: i.e. each consecutive pair of lines represents one sample.
De novo assembly
We use SPAdes with the --meta option here as we are dealing with metagenomes. First we need the Python DRMAA script, which controls where and how the job script will be run:
#!/usr/bin/env python2.7
"""Submit a Grid Engine array job through DRMAA, one task per pair of files.

Usage:
    <this script> JOB_SCRIPT FILE_LIST SLOTS

JOB_SCRIPT  script to run as the array job, resolved against the current
            working directory unless it is an absolute path.
FILE_LIST   text file with one input path per line; every two consecutive
            lines form one sample (paired-end reads).
SLOTS       value handed to the ``-pe multi`` request of each array task.

The code is written so that it runs unchanged under Python 2.7 and Python 3.
"""
import os
import sys

_USAGE = ("This script requires three arguments: 1) the script to run in ja "
          "mode 2) filelist of absolute paths and filenames 3) Number of "
          "threads/CPU for *each* job array")


def _read_file_list(path):
    """Return the non-blank lines of the file at *path*, in order."""
    with open(path) as handle:
        # Blank lines would otherwise become empty-string job arguments and
        # shift the pairing of every file that follows them.
        return [line for line in handle.read().splitlines() if line.strip()]


def main():
    """Submit an array job with one task per consecutive pair of listed files.

    Reads JOB_SCRIPT, FILE_LIST and SLOTS from ``sys.argv``. The whole file
    list is passed to the job script as its arguments and the array runs
    tasks 1..N where N is the number of file pairs (presumably each task
    picks its own pair from its task id -- confirm against the job script).

    Exits with status 2 on a wrong argument count, and with an error message
    when the file list is empty or holds an odd number of entries.
    """
    if len(sys.argv) != 4:
        sys.stderr.write(_USAGE + '\n')
        sys.exit(2)
    job_script, list_path, slots = sys.argv[1:4]

    paths = _read_file_list(list_path)
    if not paths:
        sys.exit('File list %s contains no input files' % list_path)
    if len(paths) % 2:
        sys.exit('File list %s has an odd number of entries (%d); expected '
                 'two FASTQ files per sample' % (list_path, len(paths)))
    # Floor division keeps this an int on Python 3 as well as Python 2.
    n_tasks = len(paths) // 2

    # Imported only once the inputs are known to be good, so that usage and
    # file-list errors are reported even where the DRMAA library is missing.
    import drmaa

    s = drmaa.Session()
    s.initialize()
    try:
        print('Creating job template')
        jt = s.createJobTemplate()
        try:
            # Job output is deposited in the directory we were launched from.
            jt.workingDirectory = os.getcwd()
            # os.path.join leaves an absolute JOB_SCRIPT untouched instead of
            # gluing it onto the working directory.
            jt.remoteCommand = os.path.join(jt.workingDirectory, job_script)
            jt.args = paths

            # Native qsub options: -V exports the environment, -pe requests
            # SLOTS slots in the "multi" parallel environment, -N names the job.
            jt.nativeSpecification = '-V -pe multi ' + slots + ' -N jadrm0'
            # Merge stderr into stdout: the job is IO intensive, so keep the
            # number of output files written to the file system down.
            jt.joinFiles = True

            # Array tasks are numbered 1..n_tasks in steps of 1.
            jobid = s.runBulkJobs(jt, 1, n_tasks, 1)
            print('Your job has been submitted with id ' + str(jobid))
        finally:
            # Release the template even when submission fails.
            print('Cleaning up')
            s.deleteJobTemplate(jt)
    finally:
        # Always close the DRMAA session, whatever happened above.
        s.exit()


if __name__ == '__main__':
    main()