-
Notifications
You must be signed in to change notification settings - Fork 0
/
Weekly_Stats_New
49 lines (41 loc) · 2.71 KB
/
Weekly_Stats_New
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env bash
#Run this script in a folder that contains the tsv file of DC sequences downloaded from GISAID (Download --> Sequencing Technology Metadata).
#Rename the downloaded file so that it starts with "gisaid" and ends with ".tsv"; every file matching gisaid*.tsv in the folder is read.
#Make specific files to be used later, selecting rows by their first column:
#  PHL.tsv - rows whose first column contains "DC-DFS-PHL"
#  DC.tsv  - rows whose first column contains "hCoV-19"
#Stop right away when no gisaid*.tsv file is present. Without this check awk
#fails on the unexpanded pattern and the rest of the script reports on empty tables.
gisaid_exports=(gisaid*.tsv)
if [ ! -e "${gisaid_exports[0]}" ]; then
  printf 'error: no file matching gisaid*.tsv found in %s\n' "$PWD" >&2
  exit 1
fi
awk -F"\t" '$1 ~ "DC-DFS-PHL"' "${gisaid_exports[@]}" > PHL.tsv
awk -F"\t" '$1 ~ "hCoV-19"' "${gisaid_exports[@]}" > DC.tsv
#Print the number of sequences for DC and DC-DFS-PHL, followed by the number of
#Omicron sequences (based on clade GRA) for each of them.
#
#report_matches LABEL PATTERN FILE
#  Writes LABEL without a trailing newline, then the number of lines of FILE
#  that contain PATTERN (as counted by grep -c), so both end up on one line.
#
#NOTE(review): "GRA" is matched anywhere on the line, not only in the clade
#column, so a row with "GRA" in any other field is counted as well - confirm
#this is acceptable for the export in use.
report_matches() {
  printf '%s' "$1"
  grep -c "$2" "$3"
}
report_matches "# sequences from DC: " "hCoV-19" DC.tsv
report_matches "# sequences from DFS-PHL: " "DC-DFS-PHL" PHL.tsv
report_matches "# Omicron sequences from DC (# clade GRA): " "GRA" DC.tsv
report_matches "# Omicron sequences from DFS-PHL (# clade GRA): " "GRA" PHL.tsv
#Make a spreadsheet of the number/percentage of variants for all labs for DC and DC-DFS-PHL
#
#variant_percentages FILE
#  Tallies column 13 of the tab-separated FILE and writes one row per distinct
#  value to stdout, formatted as "<value> <tab> <count> <tab> <percent>", where
#  percent is the share of all rows. A "Total" row always comes first.
#  An empty FILE yields a single Total row with count 0 and 0.00% instead of
#  aborting awk with a division by zero.
variant_percentages() {
  local table=$1
  awk -F"\t" '{print $13}' "$table" \
    | sort \
    | uniq -c \
    | awk '
        {
          # Each input line is "<count> <value>" as produced by uniq -c.
          tally[NR] = $1
          total += $1
          # Blanking the count rebuilds the line as " <value>", which is the
          # label text written to the report.
          $1 = ""
          label[NR] = $0
        }
        END {
          fmt = "%-10s \t %10d \t %7.2f%%\n"
          printf fmt, " Total", total, (total ? 100 : 0)
          # No rows means NR is 0, so the division below is never reached
          # with a zero total.
          for (i = 1; i <= NR; i++) {
            printf fmt, label[i], tally[i], tally[i] / total * 100
          }
        }'
}
variant_percentages DC.tsv > DC_counts_and_percentages.tsv
variant_percentages PHL.tsv > PHL_counts_and_percentages.tsv
#Count sub-lineages under parent lineages
#
#count_sublineages FILE
#  Reads the tab-separated FILE, pairs column 14 with the lineage in column 13,
#  and counts every dot-separated prefix of the lineage up to five parts deep
#  (e.g. BA.2.12.1 is counted under BA, BA.2, BA.2.12 and BA.2.12.1).
#  Rows of the form "<count> <tab> <column 14> <tab> <prefix>" are written to
#  stdout, sorted on the third whitespace-delimited field (the lineage prefix).
#  NOTE(review): column 14 is presumably the clade - confirm against the export.
#
#The whole computation is streamed through one pipeline, so there are no
#scratch files: leftovers of an interrupted earlier run can no longer be
#appended into the counts, and nothing has to be removed afterwards.
count_sublineages() {
  local table=$1
  awk -F"\t" '{print $14"\t"$13}' "$table" \
    | tr '.' '\t' \
    | awk -F"\t" -v max_depth=5 '
        {
          # After the periods became tabs, $1 is column 14 and $2..$NF are
          # the parts of the lineage.
          prefix = $2
          for (depth = 1; depth <= max_depth; depth++) {
            if (depth > 1) {
              prefix = prefix "." $(depth + 1)
            }
            # A trailing period means the lineage has no part at this depth,
            # so there is nothing to count here.
            if (prefix !~ /\.$/) {
              print "\t" $1 "\t" prefix
            }
          }
        }' \
    | sort \
    | uniq -c \
    | sort -k 3
}
count_sublineages DC.tsv > DC_sublineages_combined.tsv
count_sublineages PHL.tsv > PHL_sublineages_combined.tsv