forked from wenmm/IsoSeqScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
s4_genCluster.py
306 lines (238 loc) · 11.9 KB
/
s4_genCluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#! /usr/bin/env python
import re
import os
import sys
import glob
import errno
import textwrap
import argparse
# Constants
STEP3 = 's3_classify'
STEP4 = 's4_cluster'
CELL_DESC = '5-8cell_grps'
FASTA_NAME_REGEXP = r'^([\w\-]+_grp\d+_\w+_\d+cell)_isoseq_\w+\.fasta'
def createFolder(fullDir):
"""Create subfolders for storing file of filenames and scripts"""
# catch any erorrs with permissions, ignore if dirs already exist
try:
# os.makedirs works like 'mkdir -p' by making all parent dirs too
os.makedirs(fullDir)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
# /lustre/groups/cbi/brianu/projects/isoseq_muscle/results/s3_classify/<polyA or no>/5-8cell_grps/3-6kb/<grpName>
def genInputGrpDirLists(ignorePolyA, resultsRoot, size):
"""We want to build the directories to the grouped classify results (which
will be where we find the input files),"""
if ignorePolyA:
dirPolyA = 'noPolyA'
else: # Default is to not add the flag (polyA included is more stringent/recommended)
dirPolyA = 'polyA'
# generate full path to parent folder (fraction size) for input/classify results
pathToInputGrpDirs = os.path.join(resultsRoot, STEP3, dirPolyA, CELL_DESC, size)
currInputListing = os.listdir(pathToInputGrpDirs) # list of all files/folders in dir
grpInputDirNames = [] # Will extract these dir names for naming files
grpInputDirPaths = [] # For full path to the directories
for item in currInputListing:
currInputDir = os.path.join(pathToInputGrpDirs, item)
if os.path.isdir(currInputDir): # filter for listing for directories only
grpInputDirNames.append(item)
grpInputDirPaths.append(currInputDir)
return grpInputDirNames, grpInputDirPaths
def genFastaDict(grpInputDirPaths, demultiplex):
"""Create a dictionary of fasta files for making the cluster script.
fastaDict = {nflFastaPath: [FlncFastaPathList]}"""
fastaPathDict = {} # dict keys will be nfl fasta : value will be flncFastas
for i, grpDir in enumerate(grpInputDirPaths):
flncFastaList = [] # List of flnc fasta input files (will be dict values)
# non-full-length fastas always in parent group folder
# There is only one per folder so will use its full path as key
nflExpr = os.path.join(grpDir, '*_isoseq_nfl.fasta')
nflGlob = sorted(glob.glob(nflExpr)) # can't risk files being out of order
nflKey = nflGlob[0] # glob throws stuff into lists even if there is only 1 match
if demultiplex: # use nested demultiplexed folder in path
dirFlnc = 'demultiplexed'
else: # Default is to not add the flag; 7-15kb are not pooled. So use grp folder
dirFlnc = ''
flncDir = os.path.join(grpDir, dirFlnc) # append 'demultiplexed' if true
flncExpr = os.path.join(flncDir, '*_isoseq_flnc.fasta')
flncGlobList = sorted(glob.glob(flncExpr))
flncFastaList = [] # make new list with appended dirFlnc
for path in flncGlobList:
flncFastaList.append(path)
fastaPathDict[nflKey] = flncFastaList
# Note: This dictionary will contain one nfl key and one flnc if the group
# was not demultiplexed and 3 if it was demultiplexed (for each tissue)
return fastaPathDict # dictionary of fullpath to input files for cluster script
def genNameStemDict(fastaPathDict):
"""The FLNC fasta file names will be the basis for the cluster output folder
names because it has the size, group, tissue/pooled, and number of cells in
the group. Will use for making output directories and file output names.
Note: Although a bit long and unwieldly the NFL fasta full path will remain
the key, which will be useful when creating the script and can use the same
key for all dictionaries"""
# r'^([\w\-]+_grp\d+_\w+_\d+cell)_isoseq_\w+\.fasta'
nameStemsDict = {}
for nflPath, flncPathList in fastaPathDict.iteritems():
flncNameStems = [] # house list of flnc Name stems for building new dict
for flncPath in flncPathList:
flncBaseName = os.path.basename(flncPath)
reResults = re.search(FASTA_NAME_REGEXP, flncBaseName)
flncNameStem = reResults.group(1)
flncNameStems.append(flncNameStem)
nameStemsDict[nflPath] = flncNameStems
return nameStemsDict
def genFofnDict(fastaPathDict, fofnRoot, size):
"""Create a dictionary of bax and ccs fofn fullpaths. It's rather complex, maybe
there is a simpler solution out there, but this is how it is organized:
{nflPath: {'ccs': ccs.fofn, 'bax': bax.fofn}}"""
fofnCcsDir = os.path.join(fofnRoot, STEP4, CELL_DESC, 'ccs', size)
fofnBaxDir = os.path.join(fofnRoot, STEP4, CELL_DESC, 'bax', size)
fofnDict = {}
for nflPath in fastaPathDict:
nflBaseName = os.path.basename(nflPath)
reResults = re.search(FASTA_NAME_REGEXP, nflBaseName)
nflNameStem = reResults.group(1)
baxFofn = nflNameStem + '_bax.fofn'
ccsFofn = nflNameStem + '_ccs.fofn'
fullBaxPath = os.path.join(fofnBaxDir, baxFofn)
fullCcsPath = os.path.join(fofnCcsDir, ccsFofn)
fofnDict[nflPath] = {'bax': fullBaxPath, 'ccs': fullCcsPath}
return fofnDict
def makeOutDirs(ignorePolyA, nameStemsDict, resultsRoot, logRoot, scriptsRoot, size):
"""Make output directories for cluster results, logs, and scritps"""
if ignorePolyA:
dirPolyA = 'noPolyA'
else: # Default is to not add the flag (polyA included is more stringent/recommended)
dirPolyA = 'polyA'
# make scripts folder
scriptsDir = os.path.join(scriptsRoot, STEP4, dirPolyA, CELL_DESC, size)
createFolder(scriptsDir)
# make log folder
logDir = os.path.join(logRoot, STEP4, dirPolyA, CELL_DESC, size)
createFolder(logDir)
# for cluster output
clusterOutDict = {} # new dict for cluster directories, also tied to nfl path key
clusterRoot = os.path.join(resultsRoot, STEP4, dirPolyA, CELL_DESC, size)
for nflPath, flncNameStemList in nameStemsDict.iteritems():
clusterOutList = []
for flncNameStem in flncNameStemList:
currClusterDir = os.path.join(clusterRoot, flncNameStem)
clusterOutList.append(currClusterDir)
createFolder(currClusterDir)
clusterOutDict[nflPath] = clusterOutList
return clusterOutDict, scriptsDir, logDir
def genClusterScripts(fastaPathDict, fofnDict, nameStemsDict, clusterOutDict, scriptsDir, logDir):
"""Generate the cluster submit scripts for colonial one."""
# Loop through fastaPath dictionary, grab {nflPath : [flncPathList]}
for nflPath, flncPathList in fastaPathDict.iteritems():
# loop through list of flncPaths
for i, flncPath in enumerate(flncPathList):
# Output directory for cluster results
clusterOutDir = clusterOutDict[nflPath][i]
# Generate the file names and fullpath for the output files
finalConsFasta = "{0}_final_consensus.fasta".format(nameStemsDict[nflPath][i])
pathToFinalConsFasta = os.path.join(clusterOutDir, finalConsFasta)
# unpack fofn paths
baxFofn = fofnDict[nflPath]['bax']
ccsFofn = fofnDict[nflPath]['ccs']
# Add the full path the log, and script
logOutFile = "{0}_log.out".format(nameStemsDict[nflPath][i])
logErrFile = "{0}_log.err".format(nameStemsDict[nflPath][i])
logOutPath = os.path.join(logDir, logOutFile)
logErrPath = os.path.join(logDir, logErrFile)
scriptName = '{0}_cluster.sh'.format(nameStemsDict[nflPath][i])
scriptFullPath = os.path.join(scriptsDir, scriptName)
output='''#!/bin/sh
# Cluster script for {0}
# Specify partition
#SBATCH -p 128gb
# Specify number of nodes
#SBATCH -N 1
# Specify number of cores (each node has 16 cores)
#SBATCH -n 16
# one hour timelimit:
#SBATCH --time 14-00:00:00
cd {1}
echo "STARTED pbtranscript.py cluster: $(date)" >> {7}
# Load SmrtAnalysis software
module load smrt_analysis/2.3.0
# Load SmrtAnalysis enviornmental variables and the smrtShell
source /c1/apps/smrt_analysis/2.3.0/current/etc/setup.sh
/c1/apps/smrt_analysis/2.3.0/smrtcmds/bin/smrtshell
pbtranscript.py cluster \
{2} \
{3} \
--nfl_fa {4} \
-d cluster_out \
--bas_fofn {5} \
--ccs_fofn {6} \
--cDNA_size above3k \
--quiver \
--blasr_nproc 15 \
--quiver_nproc 15 \
>> {7} 2>> {8}
echo "COMPLETED pbtranscript.py cluster : $(date)" >> {7}'''.format(nameStemsDict[nflPath][i],
clusterOutDir, flncPath, pathToFinalConsFasta, nflPath, baxFofn, ccsFofn, logOutPath, logErrPath)
with open(scriptFullPath,'wb') as scriptFile:
scriptFile.write(output)
def writeRunScript(scriptsDir, size):
# Get list of roi scripts
scriptPattern = "{0}_*.sh".format(size)
scriptList = sorted(glob.glob(os.path.join(scriptsDir, scriptPattern)))
# Create the run script
runScriptName = 'submitAll_{0}.sh'.format(size)
fullRunScriptPath = os.path.join(scriptsDir, runScriptName)
header = "#! /usr/bin/env bash\n\n"
with open(fullRunScriptPath, 'wb') as runScript:
runScript.write(header)
for script in scriptList:
runCmd = "sbatch {0}\n".format(script)
runScript.write(runCmd)
def parseInput():
"""Use argparse to handle user input for program"""
# Create a parser object
parser = argparse.ArgumentParser(
prog="s4_genCluster.py",
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""Python script to create submit scripts for cluster step.""")
# Parse the polyA or the noPolyA groups
parser.add_argument('--ignorePolyA', help="", action='store_true')
parser.add_argument('--demultiplex', help="", action='store_true')
parser.add_argument("-v", "--version", action="version",
version=textwrap.dedent("""\
%(prog)s
-----------------------
Version: 0.3
Updated: 08/29/2016
By: Prech Uapinyoying
Website: https://github.com/puapinyoying"""))
# Essential args
requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument('--resultsRoot', help="", required=True)
requiredNamed.add_argument('--dataRoot', help="", required=True)
requiredNamed.add_argument('--logRoot', help="", required=True)
requiredNamed.add_argument('--scriptsRoot', help="", required=True)
requiredNamed.add_argument('--fofnRoot', help="", required=True)
requiredNamed.add_argument('--size', help="", required=True)
args = parser.parse_args()
return args
args = parseInput()
# Generate lists of input directories and paths based on polyA and size fraction
grpInputDirNames, grpInputDirPaths = genInputGrpDirLists(args.ignorePolyA,
args.resultsRoot, args.size)
# Build a dictionary of all the nfl and flnc fasta files, special directory for
# demultiplexed files. Dictionary uses nfl.fasta path as key and flnc.fastas as values
fastaPathDict = genFastaDict(grpInputDirPaths, args.demultiplex)
# RegExp flnc.fasta name stems (grp, #cells, size ect) put into dict using nfl.fasta as the key
nameStemsDict = genNameStemDict(fastaPathDict)
# Make dict of ccs and bax fofn file paths using nfl.fasta path as key again
fofnDict = genFofnDict(fastaPathDict, args.fofnRoot, args.size)
# generate output directories for cluster, scripts and log files. The clusterOutDict
# has the same nfl.fasta key, with the values being cluster dirs
clusterOutDict, scriptsDir, logDir = makeOutDirs(args.ignorePolyA, nameStemsDict,
args.resultsRoot, args.logRoot, args.scriptsRoot, args.size)
# Now, generate the cluster scripts with all the created dicts and vars
genClusterScripts(fastaPathDict, fofnDict, nameStemsDict, clusterOutDict, scriptsDir, logDir)
# Finally, create a 'run all' script for the whole size fraction.
writeRunScript(scriptsDir, args.size)