-
Notifications
You must be signed in to change notification settings - Fork 1
/
CrossPeak_functions.R
1428 lines (1300 loc) · 75.7 KB
/
CrossPeak_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#Custom functions for working with GRanges objects
#should contain only the functions used in multi species pipeline version
#Adding back round 2 potentially species-specific peaks that succeeded in individual liftovers with their respective categories to the consensus set
addMultiCorrespondingSpeciesPeaks <- function(sp_con_categories,species_spec_peaks, max_peak_num, species_all_peaks, sp, other_species){
#Figure out which peaks passed reciprocal liftover and are consensus for this species
initial_species <- other_species[1]
peak_names_to_add <- sp_con_categories[[initial_species]]$name[sp_con_categories[[initial_species]]$pass==TRUE]
for (j in 1:length(other_species)) { #Find which peaks passed for all species
sp_other = other_species[[j]]
peak_names_to_add <- intersect(peak_names_to_add, sp_con_categories[[sp_other]]$name[sp_con_categories[[sp_other]]$pass==TRUE])
}
peakNumbers <- as.numeric(sub(".*_", "", peak_names_to_add)); orderIndex <- order(peakNumbers)
orig_names <- peak_names_to_add[orderIndex]
peaks_to_add <- vector(mode = "list", length = length(other_species)); names(peaks_to_add) = other_species
for (j in 1:length(other_species)) { #Pull out coordinates for peaks that will be added to consensus list
sp_other = other_species[[j]]
peaks_to_add[[sp_other]] <- sp_con_categories[[sp_other]][sp_con_categories[[sp_other]]$name %in% orig_names]
}
#Check that they are properly sorted - this is crucial since they will be renamed and must correspond across species
#all species should be in the same order because they were sorted by original peak name going in
isSorted1 = is_sorted_by_peak_name(species_spec_peaks);
isSorted2 = TRUE
for (sp_other in other_species){
if (!is_sorted_by_peak_name(peaks_to_add[[sp_other]])){isSorted2 = FALSE}
}
if (!(isSorted1 & isSorted2)){stop('At least one of the input granges is not sorted.')}
categories = '' #List of categories for sp lifted to each other species (in original order of al species)
for (sp_other in other_species){
categories = paste0(categories, ';', peaks_to_add[[sp_other]]$category)
}
new_names = paste0('Peak_',seq(from = max_peak_num, to = max_peak_num+length(orig_names)-1, by = 1))
sp_coords = species_spec_peaks[species_spec_peaks$name %in% orig_names]
sp_coords$orig_name = orig_names
sp_coords$name = new_names
names(sp_coords) = sp_coords$name
sp_coords$category = categories
sp_coords$category_orig = categories
sp_coords$pass = TRUE #just for housekeeping, automatically passes since peak originated in this species
species_all_peaks[[sp]] = combineGrs(species_all_peaks[[sp]], sp_coords)
for (sp_other in other_species){
sp_other_coords = peaks_to_add[[sp_other]]
sp_other_coords$orig_name = orig_names
sp_other_coords$name = new_names
names(sp_other_coords) = sp_other_coords$name
sp_other_coords$category = categories
species_all_peaks[[sp_other]] = combineGrs(species_all_peaks[[sp_other]], sp_other_coords)
}
return(species_all_peaks)
}
#Making species specific sets
addMultiSpeciesOnlyPeaks <- function(species_specific_categories, species_spec_peaks,max_peak_num, sp, other_species){
#Figure out which peaks are shared between categories in all other species
initial_species <- other_species[1]
peak_names_to_add <- species_specific_categories[[initial_species]]$name
for (j in 1:length(other_species)) { #Find which peaks are in this category for all species
sp_other = other_species[[j]]
peak_names_to_add <- intersect(peak_names_to_add, species_specific_categories[[sp_other]]$name)
}
peakNumbers <- as.numeric(sub(".*_", "", peak_names_to_add)); orderIndex <- order(peakNumbers)
orig_names <- peak_names_to_add[orderIndex]
peaks_to_add <- vector(mode = "list", length = length(other_species)); names(peaks_to_add) = other_species
for (j in 1:length(other_species)) { #Pull out coordinates for peaks that will be added to consensus list
sp_other = other_species[[j]]
peaks_to_add[[sp_other]] <- species_specific_categories[[sp_other]][species_specific_categories[[sp_other]]$name %in% orig_names]
}
categories = '' #List of categories for sp lifted to each other species (in original order)
for (sp_other in other_species){
categories = paste0(categories, ';', peaks_to_add[[sp_other]]$category)
}
new_names = paste0('Peak_',seq(from = max_peak_num, to = max_peak_num+length(orig_names)-1, by = 1))
sp_coords = species_spec_peaks[species_spec_peaks$name %in% orig_names]
sp_coords$orig_name = orig_names
sp_coords$name = new_names
names(sp_coords) = sp_coords$name
sp_coords$category = categories
species_only_peaks = sp_coords
return(species_only_peaks)
}
#Assigns peaks to annotation categories based on which category the peak overlaps with the most (with intergenic being the default category for all
#ranges not specifically annotated). Ensure that duplicate ranges are removed from annotations object before using this function. Annotations object
#should have a column category with values belonging to category_types
assign_peaks_to_annotation_categories <- function(peaks, annotations, peak_width, category_types){
# Find overlaps between peaks and annotations
overlaps <- suppressWarnings(findOverlaps(peaks, annotations, ignore.strand = TRUE))
# Extract indices of overlapping peaks and annotations
peak_hits <- queryHits(overlaps)
annotation_hits <- subjectHits(overlaps)
# Get overlapping ranges
overlapping_ranges <- pintersect(peaks[peak_hits], annotations[annotation_hits])
# Compute overlap widths
overlap_widths <- as.numeric(width(overlapping_ranges))
# Extract categories as character vector
categories <- as.character(mcols(annotations)$category)
overlap_categories <- categories[annotation_hits]
# Create data frame with standard column types
df <- data.frame(
peak_idx = as.integer(peak_hits),
category = overlap_categories,
overlap_width = overlap_widths,
stringsAsFactors = FALSE
)
# Sum overlap widths within each category for each peak
df_sum <- df %>%
group_by(peak_idx, category) %>%
summarise(total_overlap = sum(overlap_width), .groups = 'drop') %>%
mutate(
peak_idx = as.integer(peak_idx),
category = as.character(category),
total_overlap = as.numeric(total_overlap)
)
# Calculate total annotated overlap for each peak
total_annotated_overlap <- df_sum %>%
group_by(peak_idx) %>%
summarise(total_annotated_overlap = sum(total_overlap), .groups = 'drop')
# Compute intergenic overlap
all_peak_idxs <- seq_along(peaks)
df_total_overlap <- data.frame(peak_idx = as.integer(all_peak_idxs), stringsAsFactors = FALSE) %>%
left_join(total_annotated_overlap, by = 'peak_idx') %>%
mutate(
total_annotated_overlap = ifelse(is.na(total_annotated_overlap), 0, total_annotated_overlap),
intergenic_overlap = peak_width - total_annotated_overlap
)
# Include intergenic category
df_intergenic <- df_total_overlap %>%
mutate(
category = 'intergenic',
total_overlap = intergenic_overlap
) %>%
dplyr::select(peak_idx, category, total_overlap)
# Ensure consistent column types before combining
df_sum <- df_sum %>%
mutate(
peak_idx = as.integer(peak_idx),
category = as.character(category),
total_overlap = as.numeric(total_overlap)
)
df_intergenic <- df_intergenic %>%
mutate(
peak_idx = as.integer(peak_idx),
category = as.character(category),
total_overlap = as.numeric(total_overlap)
)
# Combine all categories
df_all_categories <- bind_rows(df_sum, df_intergenic)
# For each peak, find the category with the maximum overlap
df_max_overlap <- df_all_categories %>%
group_by(peak_idx) %>%
dplyr::filter(total_overlap == max(total_overlap)) %>%
summarise(
category = first(category),
total_overlap = first(total_overlap),
.groups = 'drop'
)
# Apply overlap threshold
df_max_overlap <- df_max_overlap %>%
mutate(
category = ifelse(total_overlap > peak_width/2, category, 'intergenic')
)
# Summarize counts
category_counts <- df_max_overlap %>%
group_by(category) %>%
summarise(count = n(), .groups = 'drop') %>%
right_join(
data.frame(category = category_types, stringsAsFactors = FALSE),
by = 'category'
) %>%
mutate(count = ifelse(is.na(count), 0, count))
if (sum(category_counts$count) != length(peaks)){stop('Not all peaks were annotated. Check that annotations object includes category column.')}
return(category_counts)
}
#Adds unique peaks for each species after the first two to theconsensus list and excludes peaks that overlap more than tolerance
addUniquePeaks <- function(existingPeaks, newPeaks, overlap_tol, next_species) {
# Find overlaps with existing peaks
overlaps = findOverlaps(newPeaks, existingPeaks, minoverlap = overlap_tol, ignore.strand = TRUE)
# Identify non-overlapping new peaks
nonOverlappingIndices = setdiff(seq_along(newPeaks), queryHits(overlaps))
nonOverlappingNewPeaks = newPeaks[nonOverlappingIndices]
nonOverlappingNewPeaks$orig_name = nonOverlappingNewPeaks$name
nonOverlappingNewPeaks$orig_species = next_species
# Combine existing peaks with non-overlapping new peaks
updatedPeaks = c(existingPeaks, nonOverlappingNewPeaks)
return(updatedPeaks)
}
#Adds metadata about liftover failures
categorizeConsensusPeaksFailures <- function(consensus_peaks,peaks_list, sp, failure_code){
if (length(peaks_list) > 0) {
indices <- match(peaks_list, names(consensus_peaks))
indices <- na.omit(indices)
# Update the liftback_result with vectorized operations
consensus_peaks$liftback_result[indices] <- ifelse(
nzchar(consensus_peaks$liftback_result[indices]),
paste0(consensus_peaks$liftback_result[indices], ';', failure_code),
failure_code
)
# Update the liftback_species similarly
consensus_peaks$liftback_species[indices] <- ifelse(
nzchar(consensus_peaks$liftback_species[indices]),
paste0(consensus_peaks$liftback_species[indices], ';', sp),
sp
)
} else {
warning(paste0('No ', failure_code, ' peaks for species ', sp))
}
return(consensus_peaks)
}
#Checks whether round 2 peaks lift back reciprocally to peak of origin with different tolerances depending on category
#Also changes category if liftover succeeds but indel tolerance is higher than it should be for that category
checkReciprocalLiftovers <- function(spec_peaks,reciprocal_peaks,con_categories, indel_values, summit_half_width, sp_other, genome_names){
#Setup
tolerance_values = (indel_values + 2 * summit_half_width) / 2 - summit_half_width #How much a summit would move under each indel condition
get_new_category <- function(pass_current, pass_any, category, summit_diff) {
if (pass_current) {
return(category)
} else if (!pass_current && pass_any) {
# Explicitly check the current category and determine the next category based on summit_diff
if (category == "round2_indel_low" && summit_diff < tolerance_values["round2_indel_med"]) {
return("round2_indel_med")
} else if (category == "round2_indel_low" && summit_diff < tolerance_values["round2_indel_high"]) {
return("round2_indel_high")
} else if (category == "round2_indel_med" && summit_diff < tolerance_values["round2_indel_high"]) {
return("round2_indel_high")
}
}
return(category) # Default to the original category if no transitions apply
}
spec_peaks_df <- as.data.frame(spec_peaks)
reciprocal_peaks_df <- as.data.frame(reciprocal_peaks)
con_categories_df <- as.data.frame(con_categories)
#Find summits
spec_peaks_df <- spec_peaks_df %>%
dplyr::mutate(summit = (start + end) / 2)
reciprocal_peaks_df <- reciprocal_peaks_df %>%
dplyr::mutate(summit = (start + end) / 2)
#Join orig and reciprocal data frames and add info
peaks_check <- inner_join(spec_peaks_df,reciprocal_peaks_df, by = 'name')
cat_indices <- match(peaks_check$name,con_categories_df$name)
peaks_check$category = con_categories_df$category[cat_indices]
peaks_check <- peaks_check %>%
dplyr::mutate(seqnames.x = as.character(seqnames.x),
seqnames.y = as.character(seqnames.y)) %>%
dplyr::mutate(tolerance = (indel_values[category] + 2 * summit_half_width) / 2 - summit_half_width) %>%
dplyr::mutate(summit_diff = abs(summit.x - summit.y)) %>%
dplyr::mutate(pass_current = ifelse(seqnames.x == seqnames.y & summit_diff < tolerance, TRUE, FALSE)) %>%
dplyr::mutate(pass_any = ifelse(seqnames.x == seqnames.y & summit_diff < max(tolerance), TRUE, FALSE)) %>%
dplyr::mutate(category_new = mapply(get_new_category, pass_current, pass_any, category, summit_diff))
#Add info back to consensus_categories
match_indices = match(con_categories_df$name,peaks_check$name)
con_categories_df$category_new = peaks_check$category_new[match_indices]
con_categories_df$pass = peaks_check$pass_any[match_indices]
con_categories_df$pass[is.na(con_categories_df$pass)] = FALSE #set false for peaks that didn't match (ie failed or multiple reciprocal lift)
con_categories_df$summit_diff = peaks_check$summit_diff[match_indices]
con_categories_new <- GRanges(
seqnames = con_categories_df$seqnames,
ranges = IRanges(start = con_categories_df$start, end = con_categories_df$end),
strand = con_categories_df$strand,
name = con_categories_df$name,
category_orig = con_categories_df$category,
category = con_categories_df$category_new,
pass = con_categories_df$pass,
summit_diff = con_categories_df$summit_diff
)
genome(con_categories_new) = genome_names[[sp_other]]
return(con_categories_new)
}
#Combine granges objects, checking whether genomes are the same
#(default behavior only throws error if they are different but not if one is
#unspecified whereas here this function throws an error in both of these cases)
combineGrs <- function(...){
grs_list <- list(...)
# Extract genomes from each GenomicRanges object and flatten them into a single vector
genomes_vector <- unlist(lapply(grs_list, function(gr) unique(genome(gr))))
# Check if all genomic sequences refer to the same genome
if (length(unique(genomes_vector)) != 1) {
stop('Genomic ranges objects cannot be combined because not all refer to the same genome.')
} else {
# Use do.call with c to combine all GenomicRanges objects in the list
result <- do.call(c, grs_list)
return(result)
}
}
#Combines all regions with the same name on the same chromosome into a single region and returns granges with
#combined regions and data frame with the names of the regions that failed due to mapping to regions on different chromosomes,
#Input must be a granges object with a name column
combineLiftoverFragments <- function(gr) {
# Convert GenomicRanges to a data frame for easier manipulation
gr_df <- as.data.frame(gr)
# Group by 'name' and summarize each group
combined <- gr_df %>%
group_by(.data$name) %>%
summarize(
seqnames = if(n_distinct(.data$seqnames) > 1) NA_character_ else .data$seqnames[[1]],
start = min(.data$start),
end = max(.data$end),
strand = if(n_distinct(.data$strand) > 1) NA_character_ else .data$strand[[1]],
.groups = 'drop'
)
# Split the combined data frame into successes and failures - failure is when peaks of the same name mapped to different chromosomes and/or strands
successes <- combined %>% dplyr::filter(!is.na(.data$seqnames) & !is.na(.data$strand))
failures <- combined %>% dplyr::filter(is.na(.data$seqnames) | is.na(.data$strand))
# Convert successes back to GRanges
peaks_primate_lo_combined <- makeGRangesFromDataFrame(successes, keep.extra.columns = TRUE)
peaks_primate_lo_combined_sorted <- sortGrByName(peaks_primate_lo_combined)
# Return combined GRanges and failures
return(list(peaks_primate_lo_combined_sorted, failures))
}
#Check if final peaks are in the same place as original peaks (after lifting to ancestral and back), considering movement of the summit for merged peaks
compareMultiConsensusAndOriginalSummits <- function(orig_summits, merge_metadata, species_con_summits, indel_tol, summit_half_width, prefix) {
# Ensure the data is in the correct format for dplyr operations
merge_metadata <- as.data.frame(merge_metadata)
orig_summits <- as.data.frame(orig_summits)
species_con_summits <- as.data.frame(species_con_summits)
#Extract only peaks for species of interest
species_con_summits <- species_con_summits %>%
mutate(peak_of_interest = str_extract(.data$orig_name, paste0(prefix, "_peak_\\d+")))
filtered_species_con_summits <- species_con_summits %>%
filter(!is.na(.data$peak_of_interest))
names(filtered_species_con_summits)[names(filtered_species_con_summits) == "peak_of_interest"] <- paste0(prefix, "_peak_of_origin")
# Create a dataframe from merge_metadata
df <- merge_metadata
orig_summits_adjusted <- orig_summits %>%
dplyr::rename(original_name = .data$name)
# Perform the join
joined_summits <- df %>%
left_join(orig_summits_adjusted, by = c("original_peak_name" = "original_name")) %>%
mutate(
# Calculate the original summit adjustment
original_summit_adj = ifelse(!is.na(.data$start), as.numeric(.data$start) + summit_half_width + as.numeric(.data$summit_adj), NA),
# Calculate the consensus summit position
con_summit_summit = species_con_summits$start[match(.data$consensus_name, species_con_summits$name)] + summit_half_width,
# Calculate the difference between the consensus summit and the original summit adjustment
summit_diffs = .data$con_summit_summit - .data$original_summit_adj,
# Calculate a dynamic tolerance based on the adjustment
current_tols = ifelse(abs(as.numeric(.data$summit_adj))/10 < 1, indel_tol, round(abs(as.numeric(.data$summit_adj)/10) * indel_tol))
)
problem_summits <- joined_summits %>%
select(.data$consensus_name, .data$original_peak_name,.data$summit_adj, .data$summit_diffs, .data$current_tols)
# Filtering problem summits
problem_summits_final <- problem_summits %>%
filter(!is.na(.data$summit_diffs), !is.na(.data$current_tols), abs(.data$summit_diffs) > .data$current_tols)
print('Problem summits identified.')
return(problem_summits_final)
}
#Concatenates liftover results of the same names and prints a summary
concatenateAndSummarizeLiftovers <- function(human_summits, filename, summit_half_width, indel_tol){
lo <- import(paste0(filename,'_lo.bed'))
result <- combineLiftoverFragments(lo)
human_peaks_lo_all = result[[1]]; human_peaks_lo_multiple = result[[2]]
names(human_peaks_lo_all) = human_peaks_lo_all$name
summary_result = summarizeLiftoverResults(human_summits,human_peaks_lo_all,human_peaks_lo_multiple, summit_half_width)
summary = summary_result$summary
saveRDS(summary, paste0(filename,'_summary.rds'))
human_peaks_lo <- excludeDifferentWidthsSummits(human_peaks_lo_all, indel_tol, summit_half_width)
names(human_peaks_lo) = human_peaks_lo$name
return(list(all_lifted_peaks = human_peaks_lo_all,lifted_peaks_exclude_with_indel_tol = human_peaks_lo,
failed_lifted_peaks = human_peaks_lo_multiple, summary_result = summary_result))
}
#Excludes peaks overlapping blacklist region from a single species peak set
excludeBlacklistPeaks <- function(peaks,current_blacklist){
blacklist_peaks = findSingleOverlaps(peaks,current_blacklist)
peaks = peaks[!peaks$name %in% blacklist_peaks$name,]
return(peaks)
}
#Separates out summits that had a different width than the original after liftover (they can be included later depending on indel_tol)
excludeDifferentWidthsSummits <- function(gr, indel_tol, half_width){
widths = width(gr)
valid_indices <- which(widths >= 2*half_width+1 - indel_tol & widths <= 2*half_width+1 +indel_tol)
new_gr = gr[valid_indices,]
seqlevels(new_gr) <- sort(seqlevels(new_gr))
new_gr <- sort(new_gr)
}
#Add columns to consensus_peaks describing peaks that failed to lift back to individual species genomes and exclude these peaks from all species consensus peaks
excludeFailedProblemBlacklistPeaks<- function(consensus_peaks,con_peaks,problem_peaks, blacklist, species, remove_non_standard_chr, chr_to_remove){
#List the names of peaks that did not lift back successfully for each category and note in consensus_peaks
failed_peaks_list <- vector(mode = 'list', length = length(species)); names(failed_peaks_list) = species
problem_peaks_list <- vector(mode = 'list', length = length(species)); names(problem_peaks_list) = species
blacklist_peaks_list <- vector(mode = 'list', length = length(species)); names(blacklist_peaks_list) = species
chr_remove_peaks_list <- vector(mode = 'list', length = length(species)); names(chr_remove_peaks_list) = species
consensus_peaks$liftback_result = character(length(consensus_peaks))
consensus_peaks$liftback_species = character(length(consensus_peaks))
con_peaks2 = con_peaks
for (sp in species){
matched_indices <- match(mcols(consensus_peaks)$name, mcols(con_peaks[[sp]])$name)
#Failed peaks
has_no_match <- is.na(matched_indices)
failed_peaks_names = names(consensus_peaks)[has_no_match]
failed_peaks_list[[sp]] = failed_peaks_names
consensus_peaks <- categorizeConsensusPeaksFailures(consensus_peaks,failed_peaks_list[[sp]], sp, 'Failed')
#Problem peaks
problem_peaks_list[[sp]] = problem_peaks[[sp]]$consensus_name
consensus_peaks <- categorizeConsensusPeaksFailures(consensus_peaks,problem_peaks_list[[sp]], sp, 'Problem')
#Blacklist peaks
result = findSingleOverlaps(con_peaks[[sp]],blacklist[[sp]])
blacklist_peaks_list[[sp]] = result$name
consensus_peaks <- categorizeConsensusPeaksFailures(consensus_peaks,blacklist_peaks_list[[sp]], sp, 'Blacklist')
#Peaks on non-standard and excluded chromosomes
if (remove_non_standard_chr){
non_std_chr = findNonStandardChromosomes(con_peaks2[[sp]])
con_peaks2[[sp]] = dropSeqlevels(con_peaks2[[sp]],non_std_chr, pruning.mode = 'coarse')
}
con_peaks2[[sp]] = dropSeqlevels(con_peaks2[[sp]],chr_to_remove, pruning.mode = 'coarse')
peaks_removed = setdiff(con_peaks[[sp]]$name,con_peaks2[[sp]]$name)
chr_remove_peaks_list[[sp]] = peaks_removed
consensus_peaks <- categorizeConsensusPeaksFailures(consensus_peaks,chr_remove_peaks_list[[sp]], sp, 'Chr_removed')
}
failed_peaks = unique(unlist(failed_peaks_list))
problem_peaks = unique(unlist(problem_peaks_list))
blacklist_peaks = unique(unlist(blacklist_peaks_list))
chr_remove_peaks = unique(unlist(chr_remove_peaks_list))
#Exclude peaks from all species
exclude_peaks = c(failed_peaks,problem_peaks,blacklist_peaks, chr_remove_peaks)
con_peaks_final <- con_peaks
for (sp in species){
con_peaks_final[[sp]] = con_peaks[[sp]][!con_peaks[[sp]]$name %in% exclude_peaks,]
}
print('Failed, problem, and blacklisted peaks removed from all species consensus peaks.')
if (remove_non_standard_chr){'Peaks on non-standard chromosomes removed from all species consensus peaks.'}
if (is.null(chr_to_remove)){'Peaks on chr_to_remove removed from all species consensus peaks.'}
result = list(consensus_peaks,con_peaks_final)
}
#Exclude round 2 peaks that overlap more than tolerance with round 1 peaks as well as blacklisted or chr_to_remove peaks
excludeOverlappingAndBlacklistPeaks <- function(species_all_peaks1, species_con_peaks,
blacklist, species, remove_non_standard_chr, chr_to_remove, overlap_tol){
overlapping_peak_names <- vector(mode = 'list', length = length(species)); names(overlapping_peak_names) <- species
blacklist_peak_names <- vector(mode = 'list', length = length(species)); names(blacklist_peak_names) <- species
chr_to_remove_peak_names <- vector(mode = 'list', length = length(species)); names(chr_to_remove_peak_names) <- species
species_all_peaks_test = species_all_peaks1
for (sp in species){
#Overlapping peaks - exclude round2 overlapping round1
new_peaks = setDiffForPeakNames(species_all_peaks1[[sp]],species_con_peaks[[sp]])
overlapping_peak_names[[sp]] = findPeaksThatOverlapTooMuch(new_peaks, species_con_peaks[[sp]], overlap_tol)
#Blacklist peaks
result = findSingleOverlaps(species_all_peaks1[[sp]],blacklist[[sp]])
blacklist_peak_names[[sp]] = result$name
#Peaks on non-standard and excluded chromosomes
if (remove_non_standard_chr){
non_std_chr = findNonStandardChromosomes(species_all_peaks_test[[sp]])
species_all_peaks_test[[sp]] = dropSeqlevels(species_all_peaks_test[[sp]],non_std_chr, pruning.mode = 'coarse')
}
species_all_peaks_test[[sp]] = dropSeqlevels(species_all_peaks_test[[sp]],chr_to_remove, pruning.mode = 'coarse')
peaks_removed = setdiff(species_all_peaks1[[sp]]$name,species_all_peaks_test[[sp]]$name)
chr_to_remove_peak_names[[sp]] = peaks_removed
}
#Exclude peaks
all_peaks_to_remove = c(unique(unlist(overlapping_peak_names)),unique(unlist(blacklist_peak_names)), unique(unlist(chr_to_remove_peak_names)))
species_all_peaks2 = species_all_peaks1
for (sp in species){
species_all_peaks2[[sp]] = species_all_peaks2[[sp]][!(species_all_peaks2[[sp]]$name %in% all_peaks_to_remove)]
}
return(species_all_peaks2)
}
#Excludes peaks that overlap within the same granges object (e.g. after liftover) by more than tol with option to exclude all or keep the first instance
excludeSelfOverlaps <- function(gr, tol, ignore_strand, keep = 'none') {
# Find overlaps within the same GRanges object
overlaps <- findOverlaps(gr, gr, minoverlap = tol, ignore.strand = ignore_strand)
valid_overlaps <- overlaps[queryHits(overlaps) != subjectHits(overlaps)]
# Get indices
overlapping_indices <- unique(c(queryHits(valid_overlaps), subjectHits(valid_overlaps)))
non_overlapping_indices <- setdiff(seq_along(gr), subjectHits(valid_overlaps))
# Subset the GRanges object to keep only non-overlapping ranges
if (keep == 'none') {
gr_filtered <- gr[non_overlapping_indices]
}
if (keep == 'first') {
# Create a data frame of overlapping pairs
overlap_pairs <- data.frame(
query = queryHits(valid_overlaps),
subject = subjectHits(valid_overlaps)
)
# Ensure the first instance is the one with the smaller start coordinate
first_instance <- overlap_pairs %>%
mutate(min_start = pmin(start(gr[query]), start(gr[subject])),
pair_id = pmin(query, subject) * 1e10 + pmax(query, subject)) %>%
group_by(pair_id) %>%
slice_min(min_start, with_ties = FALSE) %>%
pull(query)
# Combine first instance indices with non-overlapping indices
gr_filtered <- gr[sort(unique(c(first_instance, non_overlapping_indices)))]
}
num_removed = length(gr) - length(gr_filtered)
print(paste0(num_removed, ' peaks were excluded because they overlapped by more than ', tol, ' bp.'))
return(gr_filtered)
}
#Makes a new granges object that expands a granges object by the defined amount
expandRange = function(x, upstream, downstream){
strand_is_minus = strand(x) == "-"
on_plus = which(!strand_is_minus)
on_minus = which(strand_is_minus)
start(x)[on_plus] = start(x)[on_plus] - upstream
start(x)[on_minus] = start(x)[on_minus] - downstream
end(x)[on_plus] = end(x)[on_plus] + downstream
end(x)[on_minus] = end(x)[on_minus] + upstream
x }
#Filters a granges object to retain only peaks with only user-selected categories. User must select which species categories should be used for filtering
gr_filter <- filter_for_high_confidence <- function(gr, categories, species_index){
#Split category column by semiocolon
get_entry_by_index <- function(category, col_index) {
entries <- unlist(strsplit(category, ";"))
# Remove empty entries and select the one at the desired index
entries <- entries[entries != ""]
if (length(entries) >= col_index) {
return(entries[col_index])
} else {
return(NA) # Return NA if the index is out of bounds
}
}
col_index = species_index - 1
d1 = as.data.frame(gr)
d2 = d1 %>% # Apply the get_entry_by_index function to extract the desired category for each row
mutate(sp_category = sapply(category, get_entry_by_index, col_index = col_index))
# Filter based on user-selected categories
d3 = d2[d2$sp_category %in% categories,]
# Convert back to GRanges object
gr_filter = makeGRangesFromDataFrame(d3, keep.extra.columns = TRUE)
return(gr_filter)
}
#Finds non-standard chromosomes in primate genomes - may need to modify for other species
findNonStandardChromosomes <- function(gr) {
# Extract the sequence names from the GRanges object
seq_names <- seqnames(gr)
# Define a pattern that matches non-standard chromosomes
# This pattern matches strings that do not start with 'chr'
# or have an underscore followed by additional characters after 'chrN'
pattern <- "^((?!chr).+|chr[0-9XY]+_.+|chrUn.+)$"
# Find and return sequence names that match the pattern
non_standard_chrs <- seq_names[grepl(pattern, seq_names, perl = TRUE)]
non_standard_chrs = as.character(unique(non_standard_chrs))
return(non_standard_chrs)
}
#Finds all overlaps between gr1 and gr2 and returns lengths
findOverlapLengths <- function(gr1,gr2, ignore_strand, min_overlap = 0){
commonSeqlevels <- intersect(seqlevels(gr1), seqlevels(gr2))
gr1 <- gr1[seqnames(gr1) %in% commonSeqlevels]
gr2 <- gr2[seqnames(gr2) %in% commonSeqlevels]
overlaps <- findOverlaps(gr1, gr2, ignore.strand = ignore_strand, minoverlap = min_overlap)
overlapLengths <- pmin(end(gr1[queryHits(overlaps)]), end(gr2[subjectHits(overlaps)])) -
pmax(start(gr1[queryHits(overlaps)]), start(gr2[subjectHits(overlaps)])) + 1
return(overlapLengths)
}
#Prints a summary of overlaps between two granges objects and returns the overlaps
findOverlapDetails <- function(gr1, gr2, ignore_strand,species_names, print_result = TRUE) {
commonSeqlevels <- intersect(seqlevels(gr1), seqlevels(gr2))
#gr1 <- gr1[seqnames(gr1) %in% commonSeqlevels]
#gr2 <- gr2[seqnames(gr2) %in% commonSeqlevels]
overlaps <- findOverlaps(gr1, gr2, ignore.strand = ignore_strand)
overlapLengths <- pmin(end(gr1[queryHits(overlaps)]), end(gr2[subjectHits(overlaps)])) -
pmax(start(gr1[queryHits(overlaps)]), start(gr2[subjectHits(overlaps)])) + 1
# Count of unique overlapping ranges in gr1 and gr2
numOverlaps <- list(length(unique(queryHits(overlaps))),length(unique(subjectHits(overlaps))))
# Count of non-overlapping ranges in gr1 and gr2
numNonOverlapsGr1 <- length(gr1) - numOverlaps[[1]]
numNonOverlapsGr2 <- length(gr2) - numOverlaps[[2]]
numNonOverlaps <- list(numNonOverlapsGr1,numNonOverlapsGr2)
labels = c('Non-overlapping peaks','Peaks with overlaps','Total')
df = data.frame(labels, species1 = c(c(numNonOverlaps[[1]],numOverlaps[[1]],length(gr1))),
species2 = c(numNonOverlaps[[2]],numOverlaps[[2]],length(gr2)))
names(df)[2:3] <- species_names
if(print_result){print(df)}
return(list(overlapLengths = overlapLengths,
numOverlaps = numOverlaps,
numNonOverlaps = numNonOverlaps))
}
#Finds peaks in gr1 that overlap more than tolerance with those in gr2 and return list of their names
findPeaksThatOverlapTooMuch <- function(gr1,gr2, overlap_tol){
overlaps <- findOverlaps(gr1, gr2, ignore.strand = TRUE)
overlapLengths <- findOverlapLengths(gr1, gr2, TRUE)
# Create a data frame to map overlaps
overlapMap <- data.frame(gr1Index = queryHits(overlaps),
gr2Index = subjectHits(overlaps),
overlapLen = overlapLengths)
# Filter overlaps based on the overlap tolerance
validOverlaps <- overlapMap[overlapMap$overlapLen >= overlap_tol, ]
overlap_names = gr1$name[validOverlaps$gr1Index]
return(overlap_names)
}
#Find overlaps from one gr (ex. peaks) to another (ex. a blacklist) and return a gr containing the rows of the overlapping peaks from the first list
findSingleOverlaps <- function(gr1, gr2) {
overlaps <- findOverlaps(gr1, gr2)
overlap_regions = gr1[queryHits(overlaps)]
return(overlap_regions)
}
#Takes two granges objects with a names column, finds which regions in each overlap
#2 or more regions in the #other and then outputs two data frames showing the
#correspondences
formatOverlaps <- function(query, subject, overlaps) {
queryNames <- mcols(query)$name[queryHits(overlaps)]
subjectNames <- mcols(subject)$name[subjectHits(overlaps)]
overlapsList <- split(subjectNames, queryNames)
# Filter out entries with less than two overlaps
overlapsList <- overlapsList[sapply(overlapsList, length) >= 2]
overlapsList <- lapply(overlapsList, function(x) paste(x, collapse=", "))
overlapsList
}
#For a list of peak names in one species, gets the cooresponding coordinates from species_all_peaks and species_only_peaks
getCoordsForPeakNames <- function(peak_names,sp,species_all_peaks, species_only_peaks){
all_peaks = combineGrs(species_all_peaks[[sp]], species_only_peaks[[sp]])
available_peaks = names(all_peaks)
peak_names = gsub("-", "_", peak_names);
valid_peak_names = peak_names[peak_names %in% available_peaks]
coords = all_peaks[valid_peak_names]
return(coords)
}
#Given two granges objects with names columns describing matching regions, return a data frame with the widths of the
#matching regions (missing regions will be discarded)
getMatchingRangeWidths <- function(gr1,gr2){
widths_human <- width(gr1)
if (is.null(gr1$name)){stop('Missing name column')}
names(widths_human) <- gr1$name
widths_primate_lo <- width(gr2)
if (is.null(gr2$name)){stop('Missing name column')}
names(widths_primate_lo) <- gr2$name
matched_ids <- intersect(gr1$name, gr2$name)
matched_widths_human <- widths_human[matched_ids]
matched_widths_primate_lo <- widths_primate_lo[matched_ids]
df <- data.frame(
WidthsGr1 = matched_widths_human,
WidthsGr2 = matched_widths_primate_lo
)
return(df)
}
#Gets putative species-specific peaks (peaks that didn't lift to ancestral or that failed to lift back to other species), will be tested further in round 2
getMultiSpeciesSpecificPeaks <- function(species, species_on_primate_peaks2, peaks, consensus_peaks, consensus_df){
species_orig <- vector(mode = 'list', length = length(species)); names(species_orig) <- species
merged <- vector(mode = 'list', length = length(species)); names(merged) <- species
species_spec_peaks1 <- vector(mode = 'list', length = length(species)); names(species_spec_peaks1) <- species
fail_codes = c('Failed','Problem','Blacklist','Chr_removed')
for (sp in species){
#Part1
if (!is.null(species_on_primate_peaks2[[sp]]$name)){
part1_names = setdiff(peaks[[sp]]$name,species_on_primate_peaks2[[sp]]$name)
part1 = peaks[[sp]][peaks[[sp]]$name %in% part1_names] #peaks originating from species i that failed to lift to ancestral
} else {stop('Names are null.')}
#Part 2
fail_indices = which(consensus_df$liftback_result %in% fail_codes & consensus_df$orig_species == sp)
# fail_names = unique(consensus_df$name[fail_indices])
# part2[[sp]] = consensus_peaks[fail_names] #peaks originating from species i that failed to lift back from ancestral to any species
#Get original coords
part1_orig_names = part1$name
part2_orig_names = consensus_df$orig_name[fail_indices]
orig_names = c(part1_orig_names,part2_orig_names)
matching_indices <- match(peaks[[sp]]$name, orig_names)
species_spec_peaks1[[sp]] =peaks[[sp]][!is.na(matching_indices), ]
}
#Sort by name
species_spec_peaks <- vector(mode = 'list', length = length(species)); names(species_spec_peaks) <- species
for (sp in species){
species_spec_peaks[[sp]] = sortGrByName(species_spec_peaks1[[sp]])
strand(species_spec_peaks[[sp]]) <- '*' #not considering strand info for quantification
}
print('Potential species-specific peaks identified for follow-up.')
return(species_spec_peaks)
}
#Finds peaks overlapping promoters
get_promoter_peaks <- function(peak_granges, annotation, upstream = 1000, downstream = 100){
tss <- GetTSSPositions(annotation)
promoters = expandRange(tss, upstream = upstream, downstream = downstream)
# Find overlaps between peaks and promoters
overlaps <- findOverlaps(peak_granges, promoters)
overlapping_peaks <- queryHits(overlaps)
promoter_peaks_ranges <- peak_granges[overlapping_peaks]
return(promoter_peaks_ranges)
}
#Makes a granges object from astring with a dash separating parts
makeGrangesFromString <- function(ranges_string) {
parts <- strsplit(ranges_string, "-")[[1]]
chromosome <- parts[1]
start <- as.numeric(parts[2])
end <- as.numeric(parts[3])
GRanges(seqnames = chromosome, ranges = IRanges(start = start, end = end))
}
#Gets the names of peaks from a Signac object within a specified range
getPeaksFromPlot <- function(obj,assay,extend_up,extend_down, gene = NULL, range_string = NULL){
annotation = Annotation(obj)
DefaultAssay(obj) = assay
peaks_celltypes = granges(obj)
if (!is.null(gene)){
gene_range <- subset(annotation, gene_name == gene)
extended_range <- GRanges(
seqnames = seqnames(gene_range),
ranges = IRanges(
start = start(gene_range) - extend_up,
end = end(gene_range) + extend_down
),
strand = strand(gene_range))
} else if (!is.null(range)){
range = makeGrangesFromString(range_string)
extended_range <- GRanges(
seqnames = seqnames(range),
ranges = IRanges(
start = start(range) - extend_up,
end = end(range) + extend_down
),
strand = strand(range))
}
#seqlevels(extended_range) <- paste0("chr", seqlevels(extended_range)) # Rename the sequence levels to have the "chr" prefix
overlapping_peaks <- subsetByOverlaps(peaks_celltypes, extended_range)
return(overlapping_peaks)
}
#Gets the names of peaks from a Granges object within a specified range
getPeaksinRange <- function(peaks_celltypes,annotation,extend_up,extend_down, gene = NULL, range_string = NULL){
if (!is.null(gene)){
gene_range <- subset(annotation, gene_name == gene)
extended_range <- GRanges(
seqnames = seqnames(gene_range),
ranges = IRanges(
start = start(gene_range) - extend_up,
end = end(gene_range) + extend_down
),
strand = strand(gene_range))
} else if (!is.null(range)){
range = makeGrangesFromString(range_string)
extended_range <- GRanges(
seqnames = seqnames(range),
ranges = IRanges(
start = start(range) - extend_up,
end = end(range) + extend_down
),
strand = strand(range))
}
#seqlevels(extended_range) <- paste0("chr", seqlevels(extended_range)) # Rename the sequence levels to have the "chr" prefix
overlapping_peaks <- subsetByOverlaps(peaks_celltypes, extended_range)
return(overlapping_peaks)
}
#Categorizes round2 peaks based on separate summit and peak liftovers and indel tolerances
getRound2PeakCategories <- function(species,species_spec_peaks, species_spec_summits, indel_low, indel_med, indel_high,
genome_names,lift_folder, peak_half_width, summit_half_width){
num_species <- length(species); num_other_species <- num_species-1
empty_list <- vector("list", num_species * num_other_species)
#Peaks where the summit lifted with varying indel tolerances - will be added to consensus peaks
species_perfect_peaks <- vector("list", num_species); names(species_perfect_peaks) = species
species_indel_low_peaks<- vector("list", num_species); names(species_indel_low_peaks) = species
species_indel_med_peaks<- vector("list", num_species); names(species_indel_med_peaks) = species
species_indel_high_peaks<- vector("list", num_species); names(species_indel_high_peaks) = species
#Peaks that failed to lift entirely
species_failed_peaks <- vector("list", num_species); names(species_failed_peaks) = species
#Peaks where summit failed but whole peak succeeded with width less than peak width
species_summit_deletion_peaks <- vector("list", num_species); names(species_summit_deletion_peaks) = species
#Peaks where summit lifted but width was so large that summit would move outside of existing peak - set to 0 for other species in count matrix
species_summit_with_large_indel_peaks <- vector("list", num_species); names(species_summit_with_large_indel_peaks) = species
#Peaks where summit failed and width was so large that summit would move outside of existing peak - set to 0 for other species in count matrix
species_summit_failed_large_indel_peaks <- vector("list", num_species); names(species_summit_failed_large_indel_peaks) = species
#Consolidated categories
species_consensus_categories <- vector("list", num_species); names(species_consensus_categories) = species
species_specific_categories <- vector("list", num_species); names(species_specific_categories) = species
for (sp in species){
other_species = setdiff(species,sp)
species_perfect_peaks[[sp]] <- vector("list", num_other_species); names(species_perfect_peaks[[sp]]) = other_species
species_indel_low_peaks[[sp]] <- vector("list", num_other_species); names(species_indel_low_peaks[[sp]]) = other_species
species_indel_med_peaks[[sp]] <- vector("list", num_other_species); names(species_indel_med_peaks[[sp]]) = other_species
species_indel_high_peaks[[sp]] <- vector("list", num_other_species); names(species_indel_high_peaks[[sp]]) = other_species
species_failed_peaks[[sp]] <- vector("list", num_other_species); names(species_failed_peaks[[sp]]) = other_species
species_summit_deletion_peaks[[sp]] <- vector("list", num_other_species); names(species_summit_deletion_peaks[[sp]]) = other_species
species_summit_with_large_indel_peaks[[sp]] <- vector("list", num_other_species); names(species_summit_with_large_indel_peaks[[sp]]) = other_species
species_summit_failed_large_indel_peaks[[sp]] <- vector("list", num_other_species); names(species_summit_failed_large_indel_peaks[[sp]]) = other_species
species_consensus_categories[[sp]] <- vector("list", num_other_species); names(species_consensus_categories[[sp]]) = other_species
species_specific_categories[[sp]] <- vector("list", num_other_species); names(species_specific_categories[[sp]]) = other_species
for (sp_other in other_species){
num_invalid = 0 #counter for peaks that were excluded due to ranges exceeding limits
filename = paste0(lift_folder,sp,'_on_',sp_other,'_spec_summits')
print(paste0('Processing ', sp, ' on ', sp_other,' peaks:'))
lo <- import(paste0(filename,'_lo.bed'))
result <- combineLiftoverFragments(lo)
summits_lo_all = result[[1]]; summits_lo_multiple = result[[2]]
names(summits_lo_all) = summits_lo_all$name
genome(summits_lo_all) = genome_names[[sp_other]]
result = summarizeLiftoverResults(species_spec_summits[[sp]],summits_lo_all,summits_lo_multiple, summit_half_width)
saveRDS(result$summary, paste0(filename,'_summary.rds'))
summit_failed_names <- result$names_failed
#Perfect summits - add to consensus peaks as is (low indel_tol)
species_perfect_summits = summits_lo_all[summits_lo_all$name %in% result$names_same]
summits_lo1 = setDiffForPeakNames(summits_lo_all,species_perfect_summits)
#Summits within indel_tol 5 - add to consensus peaks as is (low_indel_tol)
species_indel_low_summits <- excludeDifferentWidthsSummits(summits_lo1, indel_low, summit_half_width)
summits_lo2 = setDiffForPeakNames(summits_lo1,species_indel_low_summits)
#Summits within indel_tol 50 - add to consensus peaks with med_indel_tol
species_indel_med_summits <- excludeDifferentWidthsSummits(summits_lo2, indel_med, summit_half_width)
summits_lo3 = setDiffForPeakNames(summits_lo2,species_indel_med_summits)
#Summits within indel_tol 250 - add to consensus peaks with high_indel_tol
species_indel_high_summits <- excludeDifferentWidthsSummits(summits_lo3, indel_high, summit_half_width)
summits_lo4 = setDiffForPeakNames(summits_lo3,species_indel_high_summits)
species_summit_with_large_indel_peaks[[sp]][[sp_other]] = species_spec_peaks[[sp]][species_spec_peaks[[sp]]$name %in% summits_lo4$name] #get original peaks for that species
species_summit_with_large_indel_peaks[[sp]][[sp_other]] = sortGrByName(species_summit_with_large_indel_peaks[[sp]][[sp_other]]); species_summit_with_large_indel_peaks[[sp]][[sp_other]]$category = 'round2_summit_with_large_indel'
#Check that all peaks are accounted for
num_peaks = length(summits_lo4) + length(species_indel_high_summits) +length(species_indel_med_summits) + length(species_indel_low_summits) +length(species_perfect_summits)
if (num_peaks != length(summits_lo_all)){stop('Number of successfully lifted potential species-specific summits does not add up.')}
#Rebuild peaks around summits
species_perfect_peaks[[sp]][[sp_other]] <- makePeaksFromSummitRanges(species_perfect_summits,peak_half_width)
species_perfect_peaks[[sp]][[sp_other]] = sortGrByName(species_perfect_peaks[[sp]][[sp_other]]); species_perfect_peaks[[sp]][[sp_other]]$category = 'round2_indel_low'
num_invalid = num_invalid + length(species_perfect_summits) - length(species_perfect_peaks[[sp]][[sp_other]])
species_indel_low_peaks[[sp]][[sp_other]] <- makePeaksFromSummitRanges(species_indel_low_summits,peak_half_width)
species_indel_low_peaks[[sp]][[sp_other]] = sortGrByName(species_indel_low_peaks[[sp]][[sp_other]]); species_indel_low_peaks[[sp]][[sp_other]]$category = 'round2_indel_low'
num_invalid = num_invalid + length(species_indel_low_summits) - length(species_indel_low_peaks[[sp]][[sp_other]])
species_indel_med_peaks[[sp]][[sp_other]] <- makePeaksFromSummitRanges(species_indel_med_summits,peak_half_width)
species_indel_med_peaks[[sp]][[sp_other]] = sortGrByName(species_indel_med_peaks[[sp]][[sp_other]]); species_indel_med_peaks[[sp]][[sp_other]]$category = 'round2_indel_med'
num_invalid = num_invalid + length(species_indel_med_summits) - length(species_indel_med_peaks[[sp]][[sp_other]])
species_indel_high_peaks[[sp]][[sp_other]] <- makePeaksFromSummitRanges(species_indel_high_summits,peak_half_width)
species_indel_high_peaks[[sp]][[sp_other]] <- sortGrByName(species_indel_high_peaks[[sp]][[sp_other]]); species_indel_high_peaks[[sp]][[sp_other]]$category = 'round2_indel_high'
num_invalid = num_invalid + length(species_indel_high_summits) - length(species_indel_high_peaks[[sp]][[sp_other]])
#Get peaks that failed to lift over entirely
filename = paste0(lift_folder,sp,'_on_',sp_other,'_spec_peaks')
lo <- import(paste0(filename,'_lo.bed'))
result <- combineLiftoverFragments(lo)
peaks_lo_all = result[[1]]; peaks_lo_multiple = result[[2]]
names(peaks_lo_all) = peaks_lo_all$name
genome(peaks_lo_all) = genome_names[[sp_other]]
result = summarizeLiftoverResults(species_spec_peaks[[sp]],peaks_lo_all,peaks_lo_multiple, peak_half_width)
species_peaks_failed_summits = species_spec_peaks[[sp]][species_spec_peaks[[sp]]$name %in% result$names_failed]
species_failed_peaks[[sp]][[sp_other]] <- makePeaksFromSummitRanges(species_peaks_failed_summits,peak_half_width)
species_failed_peaks[[sp]][[sp_other]] <- sortGrByName(species_failed_peaks[[sp]][[sp_other]]); species_failed_peaks[[sp]][[sp_other]]$category = 'round2_peak_failed'
num_invalid = num_invalid + length(species_peaks_failed_summits) - length(species_failed_peaks[[sp]][[sp_other]])
saveRDS(result$summary, paste0(filename,'_summary.rds'))
#Species summit deletion peaks
#summit failed but whole peak succeeded
summit_failed_peak_success = intersect(summit_failed_names,peaks_lo_all$name)
species_summit_deletion_all = peaks_lo_all[peaks_lo_all$name %in% summit_failed_peak_success]
species_summit_deletion_summits = species_summit_deletion_all[width(species_summit_deletion_all)<2*peak_half_width+1]
species_summit_deletion_peaks[[sp]][[sp_other]] = makePeaksFromSummitRanges(species_summit_deletion_summits,peak_half_width)
species_summit_deletion_peaks[[sp]][[sp_other]] <- sortGrByName(species_summit_deletion_peaks[[sp]][[sp_other]]); species_summit_deletion_peaks[[sp]][[sp_other]]$category = 'round2_summit_deletion'
num_invalid = num_invalid + length(species_summit_deletion_summits) - length(species_summit_deletion_peaks[[sp]][[sp_other]])
species_summit_failed_large_indel_summits_othersp = setDiffForPeakNames(species_summit_deletion_all,species_summit_deletion_summits)
species_summit_failed_large_indel_peaks[[sp]][[sp_other]] = species_spec_peaks[[sp]][species_spec_peaks[[sp]]$name %in% species_summit_failed_large_indel_summits_othersp$name] #get original peaks for that species
species_summit_failed_large_indel_peaks[[sp]][[sp_other]]<- sortGrByName(species_summit_failed_large_indel_peaks[[sp]][[sp_other]]); species_summit_failed_large_indel_peaks[[sp]][[sp_other]]$category = 'round2_summit_failed_large_indel'
#summit failed and whole peak was multi-mapping or summit and peak were both multimapping
summit_failed_peak_multi = intersect(summit_failed_names, peaks_lo_multiple$name)
summit_multi_peak_multi = intersect(summits_lo_multiple$name, peaks_lo_multiple$name)
all_multi = c(summit_failed_peak_multi,summit_multi_peak_multi)
total_multi = length(unique(all_multi))
#Consolidate into consensus and species-specific categories
species_consensus_categories[[sp]][[sp_other]] = combineGrs(species_perfect_peaks[[sp]][[sp_other]],species_indel_low_peaks[[sp]][[sp_other]],species_indel_med_peaks[[sp]][[sp_other]],
species_indel_high_peaks[[sp]][[sp_other]],species_summit_deletion_peaks[[sp]][[sp_other]])
species_consensus_categories[[sp]][[sp_other]] = sortGrByName(species_consensus_categories[[sp]][[sp_other]])
species_specific_categories[[sp]][[sp_other]] = combineGrs(species_summit_with_large_indel_peaks[[sp]][[sp_other]], species_failed_peaks[[sp]][[sp_other]],
species_summit_failed_large_indel_peaks[[sp]][[sp_other]])
species_specific_categories[[sp]][[sp_other]] = sortGrByName(species_specific_categories[[sp]][[sp_other]])
#Summarize
category = c('Peaks with summits that lifted perfectly', 'Peaks with summits that lifted with low indel tolerance',
'Peaks with summits that lifted with med indel tolerance','Peaks with summits that lifted with high indel tolerance',
'Peaks with summits that lifted but with indels larger than peak width', 'Peaks that failed to lift in entirety',
'Peaks with summit deletions', 'Peaks with summit deletions and large insertions elsewhere',
'Peaks with multi-mapping summits or failed summits and multi-mapping peaks', 'Total')
values = c(length(species_perfect_peaks[[sp]][[sp_other]]) , length(species_indel_low_peaks[[sp]][[sp_other]]) , length(species_indel_med_peaks[[sp]][[sp_other]]) ,
length(species_indel_high_peaks[[sp]][[sp_other]]) , length(species_summit_with_large_indel_peaks[[sp]][[sp_other]]) , length(species_failed_peaks[[sp]][[sp_other]]) ,
length(species_summit_deletion_peaks[[sp]][[sp_other]]) , length(species_summit_failed_large_indel_peaks[[sp]][[sp_other]]), total_multi, length(species_spec_summits[[sp]]))
summary = data.frame(category,values); print(summary)
#Check that all peaks are unique and no overlap between categories
num_total = sum(values[1:(length(values)-1)]) + num_invalid
if (length(species_spec_summits[[sp]]) != num_total){stop('Number of species-specific summits across all categories (',num_total, ') does not add up to the total.')}
overlap = intersect(species_consensus_categories[[sp]][[sp_other]]$name, species_specific_categories[[sp]][[sp_other]]$name)
if (length(overlap)!=0){stop('Some peaks are in multiple categories.')}
saveRDS(summary,paste0(lift_folder, sp, '_species_specific_peaks_summary.rds'))
}
}
return(list(species_consensus_categories = species_consensus_categories, species_specific_categories = species_specific_categories))
}
'%!in%' <- function(x,y)!('%in%'(x,y))
#Initializes metadata during creation of round 1 consensus peak set
initializeMetadataDataFrames <- function(speciesList, speciesNames) {
metadataList <- list()
for (i in seq_along(speciesList)) {
gr <- speciesList[[i]]
df <- data.frame(
consensus_name = character(length(gr)),
original_peak_name = mcols(gr)$name,
original_chr = seqnames(gr),
original_start = start(gr),
original_end = end(gr),
strand_flip = strand(gr) == "-",
summit_adj = numeric(length(gr)), # Initially zero
stringsAsFactors = FALSE
)
metadataList[[speciesNames[i]]] <- df
}
return(metadataList)
}
is_odd <- function(x) {
if (x %% 2 != 0) {
return(TRUE)
} else {
return(FALSE)
}
}
#Checks whether a granges object is sorted by the peak name
is_sorted_by_peak_name <- function(gr) {
# Extract numeric part of the peak names
peak_numbers <- as.numeric(gsub(".*_([0-9]+)$", "\\1", mcols(gr)$name))
# Check if the numeric parts are in ascending order
all(diff(peak_numbers) > 0)
}
#Creates a names column with named peaks for each species according to the defined prefixes
labelPeaksWithPrefixes <- function(summits, current_species,prefixes, keep.extra.columns = FALSE){
current_prefix <- tryCatch({
prefixes[[current_species]]
}, error = function(e) {
"Un"
})
if (!keep.extra.columns){
mcols(summits) <- NULL
}
peak_names <- paste0(current_prefix, '_peak_', seq_along(summits))
mcols(summits)$name <- peak_names
return(summits)
}
#Makes fixed width peaks from granges objects If summit_col == NULL then it assumes summit is centered within summit range and extends
#both sides to create a peak with given half width. Otherwise it takes the summit as the start position plus the value in summit_col
makePeaksFromSummitRanges <- function(gr, peak_half_width, summit_col = NULL) {
if (is.null(summit_col)){
summit_positions = mid(gr)
starts <- summit_positions - peak_half_width
ends <- summit_positions + peak_half_width
} else {
summit_positions = mcols(gr)[[summit_col]]
starts <- start(gr) + summit_positions - peak_half_width
ends <- start(gr) + summit_positions + peak_half_width
}
# Filter out peaks where the start would be negative
valid_indices <- which(starts >= 1)
if (length(valid_indices) > 0) {
new_gr <- GRanges(seqnames = seqnames(gr)[valid_indices],
ranges = IRanges(start = starts[valid_indices], end = ends[valid_indices]),
strand = strand(gr)[valid_indices])
# Assigning metadata columns
mcols(new_gr) <- mcols(gr)[valid_indices,]
if("X" %in% names(mcols(new_gr))) {
names(mcols(new_gr))[names(mcols(new_gr)) == "X"] <- "name"
}
new_gr <- sort(new_gr)
names(new_gr) <- new_gr$name
genome(new_gr) = genome(gr)
}
num_invalid = length(which(starts < 1))
print(paste0('Peaks made from summit ranges (', num_invalid,' summits had invalid starts.)'))