forked from lh3/biofast
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fqcnt_py2_rfq.py
executable file
·48 lines (46 loc) · 1.45 KB
/
fqcnt_py2_rfq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
def readfq(fp): # this is a generator function
last = None # this is a buffer keeping the last unprocessed line
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
if l[0] in '>@': # fasta/q header line
last = l.rstrip() # save this line
break
if not last: break
name, seq, last = last[1:].partition(" ")[0], "", None
for l in fp: # read the sequence
if l[0] in '@+>':
last = l.rstrip()
break
seq += l.rstrip()
if not last or last[0] != '+': # this is a fasta record
yield name, seq, None # yield a fasta record
if not last: break
else: # this is a fastq record
qual, seq_len = "", len(seq)
for l in fp: # read the quality
qual += l.rstrip()
if len(qual) >= seq_len: # have read enough quality
last = None
yield name, seq, qual; # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
break
if __name__ == "__main__":
import sys, re, gzip
if len(sys.argv) == 1:
print("Usage: fqcnt.py <in.fq.gz>")
sys.exit(0)
fn = sys.argv[1]
if re.search(r'\.gz$', fn):
fp = gzip.open(fn, 'rt')
else:
fp = open(fn, 'r')
n, slen, qlen = 0, 0, 0
for name, seq, qual in readfq(fp):
n += 1
slen += len(seq)
qlen += qual and len(qual) or 0
print('{}\t{}\t{}'.format(n, slen, qlen))