-
Notifications
You must be signed in to change notification settings - Fork 0
/
neutral_loci_finder_v2.sh
62 lines (45 loc) · 1.87 KB
/
neutral_loci_finder_v2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# !/usr/local/bin/bash
# 26 Oct 2018
# A. Vershinina
# Goal: file processing from non-neutral regions to the set of neutral loci.
#########################
# usage script.sh -b nonneutral_regions.bed -g reference_genome_info.txt -r region_length -s step_length -o outfilename
#########################
# ref_genome_info is a text file looking like
# chrom size
# chr1 185838109
# chrX 124114077
# chr2 120857687
# This file should be sorted the same way as bed file.
# -r length of the region to be extracted for the final loci set.
# -s step to be used to space regions by MIN(s). Let's say you want regions of 1kb spaced by ate least 30kb. The would be -s 30000 -r 1000
# Remember that bed_from_bed works only if step>region.
# bed_from_bed.py is a script github.com/avershinina/pleistohorses/interval_jobs/bed_from_bed.py
# bedtools requires data sorted by chromosome and then by start position
# sort -k1,1 -k2,2n in.bed > in.sorted.bed
# Assign arguments
while getopts b:g:r:s:o: option
do
case "${option}"
in
b) BED=${OPTARG};;
g) REF=${OPTARG};;
r) REGION=${OPTARG};;
s) STEP=$OPTARG;;
o) OUT=$OPTARG;;
esac
done
echo 'reading' $BED 'file'
HOME=$(pwd)
mkdir ${HOME}/${OUT}_outdir
OUT_DIR=${HOME}/${OUT}_outdir
# Sort for bedtools
sort -k1,1 -k2,2n $BED > ${OUT_DIR}/${BED}.sorted
sort -k1,1 -k2,2n $REF > ${OUT_DIR}/${REF}.sorted
# Make an inverse of nonneutral regions, i.e. create a guide bed file.
bedtools complement -i ${OUT_DIR}/${BED}.sorted -g ${OUT_DIR}/${REF}.sorted > ${OUT_DIR}/${BED}.complemented
# Get regions from the guide file
cd ${OUT_DIR}
# Make the guide file smaller removing regions that are too short to be used.
cat ${OUT_DIR}/${BED}.complemented | awk '{ $4 = $3 - $2 } 1' OFS='\t' | awk '$4 >= 1001 {print $1,$2, $3, $4}' OFS='\t' > ${OUT_DIR}/${BED}.long
python ${HOME}/bed_from_bed.py -f ${OUT_DIR}/${BED}.long -r $REGION -s $STEP -o ${OUT}.extracted_regions.bed