-
Notifications
You must be signed in to change notification settings - Fork 0
/
rmsk2bed
executable file
·107 lines (92 loc) · 2.7 KB
/
rmsk2bed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
## Description: bash/awk script to convert RepeatMasker output into bed or gtf
## For details, run: ./rmsk2bed -h
# default params
outformat="bed"
TEfilter=0
# parse arguments
while getopts ":f:trh" opt; do
case $opt in
h)
echo 'Usage: zcat hg38.fa.out.gz | ./rmsk2bed [-f bed,gtf] [-th]
rmsk2bed version: 20210507
author: Benedetto Polimeni ([email protected])
Repeatmasker open-4.0.3-20130422 for hg38 asssembly can be downloaded from here:
http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.out.gz
-h Display this message and exit.
-f [bed,gtf] Output format. Default: bed.
-t Removes repeats classes that could interfere with RNA-Seq analysis:
Low_complexity, Simple_repeat, rRNA, scRNA, srpRNA, tRNA.
Default: False.
-r Only keep transpons and retrotransposons: DNA, LTR, LINE, SINE. Default: False.'
exit 0
;;
f)
outformat="$OPTARG"
;;
t)
TEfilter=1
;;
r)
TEfilter=2
;;
\?)
echo "Invalid option -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
esac
done
gawk -v OFS="\t" -v outformat=$outformat -v TEfilter=$TEfilter 'BEGIN {
# print bed header
if (outformat=="bed") {
print "#chr\tstart\tend\trepName\tSWscore\tstrand\trepSubfamily\trepFamily\trepClass\tmilliDiv\tmilliDel\tmilliIns\trepStart\trepEnd\trepLeft\trepInstance"
}
} NR>3 {
sw_score=$1
millidiv=$2
millidel=$3
milliins=$4
chr=$5
start=$6-1
end=$7
strand=($9=="+" ? "+" : "-")
subfam=$10
repstart=(strand=="+" ? $12 : $14)
if (repstart ~/\(/) {
# remove parenthesis from repeat starting coordinate in consensus
gsub(/[\(\)]/,"",repstart)
}
repstart-=1
repend=$13
repleft=(strand=="+" ? $14 : $12)
if (repleft ~/\(/) {
# remove parenthesis from consensus left bases
gsub(/[\(\)]/,"",repleft)
}
instance=$15
# get family and class from 11th column
if ($11~/\//) {
split($11,fam,"/")
class = fam[1]
family = fam[2]
} else {
class = $11
family = $11
}
if (TEfilter==1 && (class~/^(Low_complexity|Simple_repeat|rRNA|scRNA|srpRNA|tRNA)$/)) {
next
} else if (TEfilter==2 && !(class~/^(DNA|LTR|LINE|SINE)$/)) {
next
}
# assign unique ID to individual loci
sfarr[subfam]+=1
repname=subfam"_l"sfarr[subfam]
if (outformat=="bed") {
print chr,start,end,repname,sw_score,strand,subfam,family,class,millidiv,millidel,milliins,repstart,repend,repleft,instance
} else if (outformat=="gtf") {
print chr,"hg38_rmsk","exon",start+1,end,sw_score,strand,".","gene_id \""subfam"\"; transcript_id \""repname"\"; family_id \""family"\"; class_id \""class"\";"
}
}'