Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Wytamma/GISAIDR
Browse files Browse the repository at this point in the history
  • Loading branch information
Wytamma committed Apr 25, 2023
2 parents f496a54 + f21b25d commit 71a887a
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Set up R ${{ matrix.config.os }} (${{ matrix.config.r }})
uses: r-lib/actions/setup-r@v1
uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
Expand Down
35 changes: 35 additions & 0 deletions R/internal_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#' @param collection_date_complete include only entries with a complete collection date in the results.
#' @param total returns the total number of sequences matching the query.
#' @param fast returns all of the accession_ids that match the query.
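#' @param aa_substitution comma-separated amino acid substitution(s) to filter by (e.g. 'Spike_E484Q, Spike_H69del'); prefix a substitution with '-' to exclude sequences that carry it.
#' @param nucl_mutation comma-separated nucleotide mutation(s) to filter by (e.g. 'T22679C, G28881A'); prefix a mutation with '-' to exclude sequences that carry it.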
#' @return Dataframe.
internal_query <-
function(credentials,
Expand All @@ -34,6 +36,8 @@ internal_query <-
to_subm = NULL,
virus_name = NULL,
order_by = NULL,
aa_substitution = NULL,
nucl_mutation = NULL,
order_asc = TRUE,
start_index = 0,
nrows = 50,
Expand Down Expand Up @@ -170,6 +174,35 @@ internal_query <-
)
}

# amino acid changes
if (!is.null(aa_substitution)) {
queue <-
append(
queue,
create_search_queue(
credentials,
credentials$aa_substitution_ceid,
aa_substitution,
'FilterChange'
)
)
}

# nucleotide changes
if (!is.null(nucl_mutation)) {
queue <-
append (
queue,
create_search_queue(
credentials,
credentials$nucl_mutation_ceid,
nucl_mutation,
'FilterChange'
)
)
}


if (low_coverage_excl) {
queue <-
append(
Expand Down Expand Up @@ -342,6 +375,8 @@ internal_query <-
from_subm = from_subm,
to = to,
to_subm = to_subm,
aa_substitution = aa_substitution,
nucl_mutation = nucl_mutation,
nrows = j$totalRecords,
# set load_all to false to break the recursion
load_all = FALSE,
Expand Down
12 changes: 12 additions & 0 deletions R/login.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ login <- function(username, password, database="EpiCoV") {
linage_ceid <- NULL
}

# AA substitution/mutation- ", " separated values
aa_substitution_ceid <- extract_search_ceid('mutation', customSearch_page_text)
# nucleotide substitution/nuc mutation, ", " separated values
nucl_mutation_ceid <- extract_search_ceid('nuc_mutation', customSearch_page_text)
# Virus Name
virus_name_ceid <- extract_search_ceid('covv_virus_name', customSearch_page_text)

Expand Down Expand Up @@ -250,6 +254,12 @@ login <- function(username, password, database="EpiCoV") {
# collection date complete
collection_date_complete_ceid <-
extract_search_ceid('quality2', customSearch_page_text)
# AA substitution
aa_substitution_ceid <-
extract_search_ceid('mutation', customSearch_page_text)
# nucleotide mutation
nucl_mutation_ceid <-
extract_search_ceid('nuc_mutation', customSearch_page_text)
}

# send selection command
Expand Down Expand Up @@ -281,6 +291,8 @@ login <- function(username, password, database="EpiCoV") {
text_ceid = text_ceid,
location_ceid = location_ceid,
search_cid = search_cid,
aa_substitution_ceid = aa_substitution_ceid,
nucl_mutation_ceid = nucl_mutation_ceid,
linage_ceid = linage_ceid,
virus_name_ceid = virus_name_ceid,
from_ceid = from_ceid,
Expand Down
14 changes: 12 additions & 2 deletions R/query.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#' @param collection_date_complete include only entries with a complete collection date in the results.
#' @param total returns the total number of sequences matching the query.
#' @param fast returns all of the accession_ids that match the query.
#' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix
#' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix
#' @return data.frame
query <-
function(credentials,
Expand All @@ -35,6 +37,8 @@ query <-
to_subm = NULL,
virus_name = NULL,
order_by = NULL,
aa_substitution = NULL,
nucl_mutation = NULL,
order_asc = TRUE,
start_index = 0,
nrows = 50,
Expand Down Expand Up @@ -69,7 +73,9 @@ query <-
low_coverage_excl = low_coverage_excl,
complete = complete,
high_coverage = high_coverage,
collection_date_complete = collection_date_complete
collection_date_complete = collection_date_complete,
aa_substitution = aa_substitution,
nucl_mutation = nucl_mutation
))
}
return(results)
Expand All @@ -93,6 +99,8 @@ query <-
complete = complete,
high_coverage = high_coverage,
collection_date_complete = collection_date_complete,
aa_substitution = aa_substitution,
nucl_mutation = nucl_mutation,
total = total,
fast = fast
)
Expand All @@ -117,7 +125,9 @@ query <-
low_coverage_excl = low_coverage_excl,
complete = complete,
high_coverage = high_coverage,
collection_date_complete = collection_date_complete
collection_date_complete = collection_date_complete,
aa_substitution = aa_substitution,
nucl_mutation = nucl_mutation
)
)
}
Expand Down
42 changes: 20 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,27 @@ total

## Download

To download the full data set you need a list of accession IDs (which can be obtained from `query` results).
To download the full data set you need a list of accession IDs (which can be obtained from `query` results). This will also download the sequence data for each entry.

``` r
full_df_with_seq <- download(
credentials = credentials,
list_of_accession_ids = list_of_accession_ids,
)
full_df_with_seq$sequence
```

[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
...

You can stop GISAIDR from loading the sequence data into memory by setting `get_sequence = FALSE`. Note: the sequence data will still be downloaded.

``` r
df <- query(credentials = credentials)
list_of_accession_ids <- df$accession_id
full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids)
full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids, get_sequence=FALSE)
colnames(full_df)
```

Expand All @@ -302,24 +317,6 @@ colnames(full_df)

Note: a maximum of 5000 results can be downloaded at a time.

### Get sequence data

Use the `get_sequence` argument to download the sequences with the full data.

``` r
full_df_with_seq <- download(
credentials = credentials,
list_of_accession_ids = list_of_accession_ids,
get_sequence=TRUE
)
full_df_with_seq$sequence
```

[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...
...

### Export to fasta file

Use the `export_fasta()` function to write sequence data to a file in FASTA format. The sequence names will be `country@pango_lineage@accession_id@date`, with the date in decimal format (requires the [lubridate](https://cran.r-project.org/web/packages/lubridate/index.html) package). By default, only sequences for which a decimal date could be set are exported. To export all sequences, use the argument `export_dated_only = FALSE`.
Expand Down Expand Up @@ -417,8 +414,9 @@ quality_ceid <- extract_search_ceid("quality'", customSearch_page_text)
```

1. Add the extracted `ceid` to the list of `credentials` e.g. `complete_ceid = complete_ceid`
2. Add the new argument and default value to the `query()` function in `query.R` e.g. `complete = FALSE`.
3. Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g.
2. Add the new argument and its default value to **both** query functions, i.e. `query()` in `query.R` and `internal_query()` in `internal_query.R`, e.g. `complete = FALSE`.
3. Pass the new argument through in the recursive call inside the `(load_all && j$totalRecords > nrows)` branch (e.g. `complete = complete`) so that every paginated request keeps using the argument.
4. Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g.

``` r
if (complete) {
Expand Down
23 changes: 23 additions & 0 deletions tests/testthat/test-query.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,29 @@ test_that("order_by works", {
expect_true(df$submission_date[1] == "2020-01-10")
})

test_that("aa_substitution works", {
  # Two substitutions that must be present plus one that must be absent
  # (a leading '-' marks negative selection).
  wanted_substitutions <- 'Spike_E484Q, Spike_H69del, -N_P13L'

  hits <- query(
    credentials = credentials,
    aa_substitution = wanted_substitutions,
    to_subm = '2023-02-22',
    load_all = TRUE,
    order_by = 'submission_date'
  )

  expect_true(is.data.frame(hits))

  # Results are ordered by submission date (ascending is the default), so the
  # first row carries the earliest submission among the matches.
  earliest_submission <- hits$submission_date[1]
  expect_equal(earliest_submission, "2021-01-25")

  n_hits <- nrow(hits)
  expect_equal(n_hits, 576)

  # Original author's note: to test accuracy, a set of 4 rarely co-existing
  # mutations can be used to verify:
  # Spike_H69del, Spike_A222V, Spike_G476S, -N_P13L
})

test_that("nucl_mutation works", {
  # Four mutations that must be absent (leading '-') followed by three that
  # must be present.
  wanted_mutations <-
    '-T23599G, -C10029T, -C14408T, -A23403G, T22679C, G28881A, A24424T'

  hits <- query(
    credentials = credentials,
    nucl_mutation = wanted_mutations,
    to_subm = '2023-02-22',
    load_all = TRUE,
    order_by = 'submission_date'
  )

  expect_true(is.data.frame(hits))

  # Results are ordered by submission date (ascending is the default), so the
  # first row carries the earliest submission among the matches.
  earliest_submission <- hits$submission_date[1]
  expect_equal(earliest_submission, "2021-12-29")

  n_hits <- nrow(hits)
  expect_equal(n_hits, 55)
})

test_that("text search works", {
accession_ids = c("EPI_ISL_17398411", "EPI_ISL_17199001", "EPI_ISL_17409201", "EPI_ISL_17243716")
df <- query(credentials = credentials, text = paste(accession_ids, collapse = "\n"))
Expand Down

0 comments on commit 71a887a

Please sign in to comment.