forked from Nealelab/UK_Biobank_GWAS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
17.create_finngen_pipelines.py
53 lines (39 loc) · 2.46 KB
/
17.create_finngen_pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import print_function
from hail import *
hc = HailContext()
kt = hc.import_table('gs://ukb31063-mega-gwas/curated-phenotypes/2018-04-06_ukb-finngen-pheno-for-analysis.tsv', key='eid', types={'eid': TString()}, impute=True)
kt = kt.rename({'eid': 's'})
kt_counts = hc.import_table('gs://ukb31063-mega-gwas/curated-phenotypes/2018-04-07_ukb-finngen-all-pheno-counts.tsv', key='NAME',
types={'Ncase': TInt(), 'Nctrl': TInt(), 'Prop': TDouble()}, missing='')
exclude = set(kt_counts.query('NAME.filter(x => isDefined(HD_ICD_10)).collect()'))
traits = [x for x in kt.columns if x != 's' and x not in exclude]
kts = {
'both_sexes': kt.join(hc.read_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.kt'), how='inner').cache(),
'female': kt.join(hc.read_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.females.kt'), how='inner').cache(),
'male': kt.join(hc.read_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.males.kt'), how='inner').cache()
}
queries = {
'both_sexes': kts['both_sexes'].query(['`{}`.filter(s => s != 0).count()'.format(x) for x in traits]),
'female': kts['female'].query(['`{}`.filter(s => s != 0).count()'.format(x) for x in traits]),
'male': kts['male'].query(['`{}`.filter(s => s != 0).count()'.format(x) for x in traits])
}
selections = {
'both_sexes': [x for i, x in enumerate(traits) if queries['both_sexes'][i] >= 50],
'female': [x for i, x in enumerate(traits) if queries['female'][i] >= 50],
'male': [x for i, x in enumerate(traits) if queries['male'][i] >= 50]
}
for k, table in kts.iteritems():
kt_filter = table.select(['s'] + selections[k])
cols = [x for x in kt_filter.columns if x != 's']
groups = [cols[i:(i+110)] for i in xrange(0, len(cols), 110)]
current_pipeline = 0
for g in groups:
print('Sex: {}'.format(k))
print('Pipeline: {:}'.format(current_pipeline))
print('Traits: ', g)
(kt_filter.select(['s'] + g)
.write('gs://ukb31063-mega-gwas/hail-0.1/phenotype-pipelines/finngen/ukb31063.{0}.finngen.pipeline.{1:}.kt'.format(k, current_pipeline), overwrite=True))
print('Sex: {}'.format(k))
print('Pipeline: {:,}'.format(current_pipeline))
print('nSamples: {:,}'.format(hc.read_table('gs://ukb31063-mega-gwas/hail-0.1/phenotype-pipelines/finngen/ukb31063.{0}.finngen.pipeline.{1:}.kt'.format(k, current_pipeline)).count()))
current_pipeline += 1