-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
164 lines (130 loc) · 7.94 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Targets that name commands rather than files; make always considers them
# out of date. `download` is listed so that a stray file called "download"
# can never cause the fetch step to be skipped.
# NOTE(review): the rule in this file is named target/non-human-samples.tsv.gz,
# not target/non-human-samples.tsv -- confirm which name is intended here.
.PHONY: target/non-human-samples.tsv .FORCE smalltest biosample_set_basex \
        biosample_table biosample_indices column-accounting download
# Fetch the raw inputs into downloads/: the NCBI BioSample XML dump and the
# kb_cmash EBI sample-metadata CSV.
# -f makes curl exit non-zero on an HTTP error instead of quietly saving the
# server's error page as if it were data; -S keeps the error message visible
# even though -s silences the progress meter.
target download:
	mkdir -p downloads
	curl -L -s -S -f https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz > downloads/biosample_set.xml.gz
	curl -L -s -S -f https://raw.githubusercontent.com/kbaseapps/kb_cmash/master/lib/kb_cmash/utils/data/ebi_samples_metadata_with_studies_final_with_cols.csv > downloads/ebi_samples_metadata_with_studies_final_with_cols.csv
# Download the EMP release-1 QIIME mapping file.
# The transfer goes to a temporary name and is renamed only on success, so a
# failed or interrupted download never leaves a truncated file that make would
# treat as up to date. -f/-S make curl fail loudly on server errors.
downloads/emp.tsv:
	curl -L -s -S -f ftp://ftp.microbio.me/emp/release1/mapping_files/emp_qiime_mapping_release1.tsv > $@.tmp
	mv -f $@.tmp $@
# Download the EMP studies CSV from Zenodo.
# -f stops an HTTP error page from being stored as the CSV; writing to a
# temporary name and renaming on success keeps a partial download from
# masquerading as a finished target.
downloads/emp_studies.csv:
	curl -L -s -S -f 'https://zenodo.org/record/890000/files/emp_studies.csv?download=1' > $@.tmp
	mv -f $@.tmp $@
# Re-encode the EMP studies listing from CSV to TSV using Miller (mlr).
target/emp_studies.tsv: downloads/emp_studies.csv
	mlr --icsv --otsv cat $< > $@
# Stream the compressed BioSample dump through util/hacky-scan.pl and save
# whatever the script prints.
# NOTE(review): downstream rules read this file as tab-separated
# attribute/value pairs -- confirm against the script.
target/attributes.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/hacky-scan.pl > $@
# Rank attribute names by how often they are used: drop rows whose value
# starts with "not determined" or "missing", keep column 1, and pass it through
# util/count-occ.pl and util/mysort (reverse numeric on the first key).
# A real TAB is generated with printf because GNU grep does not treat "\t" in
# a regular expression as a tab; `grep -E` replaces the deprecated `egrep`.
target/attribute-usage.tsv: target/attributes.tsv
	grep -E -v "$$(printf '\t')(not determined|missing)" $< | cut -f1 | ./util/count-occ.pl | ./util/mysort -r -k1 -n > $@
# Keep only the rows of attributes.tsv that begin with "env_".
target/envo-usage.tsv: target/attributes.tsv
	grep -e '^env_' $< > $@
# Feed the second tab-separated column of envo-usage.tsv to util/count-occ.pl.
target/envo-usage-stats.tsv: target/envo-usage.tsv
	cut -f 2 $< | ./util/count-occ.pl > $@
# Builds a TSV in entity-attribute-value layout.
#   columns: id|attribute|value
#   - for attribute tags, only those with harmonized names are collected
#   - text values, such as paragraph and taxonomy name, are also collected
#     as attributes
target/harmonized-values-eav.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/harmonized-eav.pl > $@
# Compressed copy of target/harmonized-values-eav.tsv (the source file is kept).
target/harmonized-values-eav.tsv.gz: target/harmonized-values-eav.tsv
	gzip --verbose --stdout $< > $@
# Builds a TSV holding ONLY the attributes that carry a harmonized name, e.g.
#   <Attribute attribute_name="estimated_size" harmonized_name="estimated_size">2550000</Attribute>
#   columns: accession|attribute|value
target/harmonized-attributes-only-eav.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/harmonized-attributes-only-eav.pl > $@
# Compressed copy of target/harmonized-attributes-only-eav.tsv (the source
# file is kept).
target/harmonized-attributes-only-eav.tsv.gz: target/harmonized-attributes-only-eav.tsv
	gzip --verbose --stdout $< > $@
# Converts harmonized-values-eav.tsv to RDF (Turtle).
#   - output triples have the form <:subj> <:pred> <:value>, so the file is
#     also valid N3
#   - each attribute additionally gets an rdfs label
target/harmonized-attribute-value.ttl: target/harmonized-values-eav.tsv.gz
	gzip --decompress --stdout $< | ./util/harmonized-eav-to-rdf.pl > $@
# Compressed copy of target/harmonized-attribute-value.ttl (the source file
# is kept).
target/harmonized-attribute-value.ttl.gz: target/harmonized-attribute-value.ttl
	gzip --verbose --stdout $< > $@
# Pivots the EAV rows of harmonized-values-eav.tsv into a tabular, one column
# per attribute, layout.
target/harmonized-table.tsv: target/harmonized-values-eav.tsv
	./util/harmonized-eav-pivot.pl $< > $@
# Compressed copy of target/harmonized-table.tsv (the source file is kept).
target/harmonized-table.tsv.gz: target/harmonized-table.tsv
	gzip --verbose --stdout $< > $@
# Stores target/harmonized-table.tsv as a parquet file, which makes the data
# easier to load. The script receives the input path and then the output path.
target/harmonized-table.parquet.gz: target/harmonized-table.tsv
	python ./util/save-harmonized-table-to-parquet.py $< $@
# Loads target/harmonized-table.tsv into an sqlite3 database.
# NB: this step takes a few hours to complete.
target/harmonized_table.db: target/harmonized-table.tsv
	python ./util/save-harmonized-table-to-sqlite.py $< $@
# Compressed copy of target/harmonized_table.db (the source file is kept).
target/harmonized_table.db.gz: target/harmonized_table.db
	gzip --verbose --stdout $< > $@
# Pivots target/biosample-attribute-value.tsv (EAV format) into a tabular
# column format.
target/biosample-table.tsv: target/biosample-attribute-value.tsv
	util/biosample-eav-pivot.pl $< > $@
############### WARNING: I RAN THIS ALL NIGHT AND IT DID NOT COMPLETE
# target/biosample-table.tsv.gz: target/biosample-table.tsv
# # gzips target/biosample-table.tsv
# gzip -v -c $< > $@
# target/biosample-table-reduced.tsv:
# # similar to biosample-table.pl, but only puts attribute values in the table
# util/biosample-table-reduced.pl downloads/biosample_set.xml > $@
##################
# Lookup table that maps attribute names to their harmonized names.
target/attribute-to-harmonized-lookup.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/attribute-to-harmonized-lookup.pl > $@
# Lookup table that maps taxonomy ids to taxonomy names.
target/taxon-id-to-name-lookup.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/taxon-id-to-name-lookup.pl > $@
# Lookup table that maps each biosample to its taxonomy id.
target/biosample-to-taxon-id.tsv:
	gzip --decompress --stdout downloads/biosample_set.xml.gz | ./util/biosample-to-taxon-id.pl > $@
# For the attribute named by the pattern stem, extract the second column of
# every row in attributes.tsv whose first column is that attribute.
# The TAB delimiter is generated with printf because GNU grep does not treat
# "\t" in a regular expression as a tab; `grep -E` replaces the deprecated
# `egrep`. $* is expanded by make, $$(...) by the shell.
target/occurrences-%.tsv: target/attributes.tsv
	grep -E "^$*$$(printf '\t')" $< | cut -f2 > $@
# Keep these per-attribute files even when make built them only as an
# intermediate step towards target/distinct-%.tsv.
.PRECIOUS: target/occurrences-%.tsv
# Runs util/count-occ.pl over the occurrences file for the same stem and
# orders the result with util/mysort (reverse, numeric, first key).
target/distinct-%.tsv: target/occurrences-%.tsv
	./util/count-occ.pl $< | ./util/mysort -r -n -k1 > $@
# Executes the jupyter notebook src/notebooks/build-non-human-samples.ipynb,
# which is what creates target/non-human-samples.tsv.gz.
# The .FORCE prerequisite makes this rule run on every request.
# NB: target/harmonized-table.parquet.gz must already exist locally.
target/non-human-samples.tsv.gz: .FORCE
	jupyter nbconvert --execute --clear-output src/notebooks/build-non-human-samples.ipynb
# Writes a file containing the number of times each MIxS triad occurs.
# The database is passed as the first prerequisite; .FORCE makes the rule run
# on every request.
# NB: target/harmonized_table.db must exist locally.
target/mixs-triad-counts.tsv: target/harmonized_table.db .FORCE
	util/mixs-triad-counts.py -db $< -out $@
# Produces a one-hot encoded representation of the named entities found by
# runNER (input: runNER_Output.tsv).
target/nmdc-biosample-one-hot.tsv: downloads/nmdc-gold-path-ner/runner/runNER_Output.tsv
	util/one-hot-encode-nmdc-biosample-net.py -i $< -o $@
# Compressed copy of target/nmdc-biosample-one-hot.tsv (the source file is kept).
target/nmdc-biosample-one-hot.tsv.gz: target/nmdc-biosample-one-hot.tsv
	gzip --verbose --stdout $< > $@
#target/%MIxS_columns.tsv: https://github.com/cmungall/mixs-source/tree/main/src/schema
## This notebook generates two files : MIxS_columns.tsv and Non_MIxS_columns.tsv.
## Highlights the data column names that are MIxS terms and non-MIxS terms
# jupyter nbconvert --execute --clear-output src/notebooks/MIxS_comparison.ipynb
# depends on target/harmonized_table.db but is not triggering it here
# Compares the column names of the SQLite `biosample` table with the column
# names obtained from the XQuery runs and records the differences in
# target/column-accounting.txt.
#
# Setup: install basex and load
# ftp://ftp.ncbi.nlm.nih.gov//biosample/biosample_set.xml.gz with default settings.
# Increasing basex's Java RAM allocation will improve performance.
# Probably any xquery parser could run these queries.
column-accounting:
	sqlite3 target/harmonized_table.db "pragma table_info(biosample)" | cut -f2 -d'|' | sort > target/biosample_sqlite_columns.txt
# ~ 3 minutes @ 24 GB RAM
# The basex step is commented out and target/hn_count* is committed instead,
# assuming that most people don't have basex installed:
#	date ; basex xqueries/hn_count.xq > target/hn_count.txt ; date
# Column 1 of hn_count.txt, minus its header row, sorted.
	cut -f1 target/hn_count.txt | tail -n +2 | sort > target/hn_columns.txt
# 10 minutes @ 24 GB RAM
# Commented out as above:
#	date ; basex xqueries/non-attribute-minimal.xq > target/non-attribute-minimal.txt ; date
#	head -n 1 target/non-attribute-minimal.txt | tr '\t' '\n' > target/non-attribute-minimal_columns.txt
	cat target/hn_columns.txt target/non-attribute-minimal_columns.txt | sort | uniq > target/xquery_columns.txt
# Keep only diff's "<" and ">" lines. The pattern is anchored for both
# characters (the previous '^>|<' matched "<" anywhere in a line), and sed is
# used so that an empty result -- identical column sets -- is not reported by
# make as a failed recipe.
	diff target/biosample_sqlite_columns.txt target/xquery_columns.txt | sort | sed -n '/^[<>]/p' > target/column-accounting.txt
# > indicates columns that the xqueries obtain but which are not found in the SQLite
#   possibly/mostly because the xqueries were run over a slightly newer
#   biosample_set.xml which has additional attributes with harmonized names
#   entrez_links from xquery uses a different (more inclusive?) strategy
#   compared to the entrez* columns from SQLite
# < indicates columns that appear in the SQLite artifact but are not obtained from xquery
#   the SQLite entrez* columns contain some of the information in the xquery
#   entrez_links column distributed over three columns
# compare the entrez annotations for BIOSAMPLE:SAMN00004593 between the two sources
#   xquery: bioproject:PRJNA141675:141675|bioproject:PRJNA138711:138711
#   attribute from SQLite is all NULL
# this column name analysis does not say anything about column contents