Skip to content

Commit

Permalink
changed to use the extraction codes from RSV-NET directlyi, replacing…
Browse files Browse the repository at this point in the history
… of copy_rsv_data.yaml
  • Loading branch information
kjsato committed Sep 19, 2024
1 parent f8ab0b7 commit f07593b
Show file tree
Hide file tree
Showing 15 changed files with 60,588 additions and 0 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/update-rsv-data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: Update RSV data

on:
workflow_dispatch:
schedule:
- cron: '0 9 * * 5'

jobs:
update-rsv-data:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v3
with:
python-version: '3.10'

- name: Install dependencies
run: pip install -r requirements.txt

- name: Run Python
run: |
python auxiliary-data/rsv/extract_rsv.py
- name: Commit
run: |
git config user.name Github-Actions
git config user.email "<>"
if [ -n "$(git status --porcelain)" ]; then
git add .
git commit -m "Update target data $(date)"
git push
echo "Target data updated";
else
echo "no updates";
fi
42 changes: 42 additions & 0 deletions .github/workflows/update-target.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
name: Update target data

on:
workflow_dispatch:
workflow_run:
workflows: [Update RSV data]
types:
- completed

jobs:
update-target-data:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Setup R
uses: r-lib/actions/setup-r@v2

- name: Install system dependencies
run: sudo apt-get install libcurl4-openssl-dev libudunits2-dev libgdal-dev

- name: Install dependencies
run: |
R -e 'install.packages(c("lubridate", "dplyr", "tidyr", "arrow"))'
- name: Update target data
run: Rscript target-data/get_target_data.R

- name: Commit
run: |
git config user.name Github-Actions
git config user.email "<>"
if [ -n "$(git status --porcelain)" ]; then
git add .
git commit -m "Update target data $(date)"
git push
echo "Target data updated";
else
echo "no updates";
fi
13,573 changes: 13,573 additions & 0 deletions auxiliary-data/location_census/state_pop_data.csv

Large diffs are not rendered by default.

149 changes: 149 additions & 0 deletions auxiliary-data/rsv/extract_rsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import os
import io
import requests
from bs4 import BeautifulSoup
from datetime import date
import pandas as pd


def get_table_headers(table):
table_headers = []
# the first tr contain the column headers
# skip the first "column header" because it's blank and for the row ids we are not keeping
for th in table.find("tr").find_all("th")[1:]:
table_headers.append(th.text.strip().lower().replace(" ", "_"))
return table_headers


def get_table_rows(table):
table_rows = []
# skip the first tr as that was the column header
for tr in table.find_all("tr")[1:]:
# skip the first column in each row as it is a row number
tds = tr.find_all("td")[1:]
cells = []
for td in tds:
cells.append(td.text.strip())
table_rows.append(cells)
return table_rows


def historical_date(df, file_dir, date_colname):
today = date.today()
today_str = today.strftime("%Y-%m-%d")
df[date_colname] = pd.to_datetime(df[date_colname]).dt.strftime("%Y-%m-%d")
df["as_of"] = today_str

if os.path.isfile(file_dir):
df0 = pd.read_csv(file_dir)
df_ts = set(list(df[date_colname].drop_duplicates())).intersection(
list(df0[date_colname].drop_duplicates()))
df0 = df0[~df0[date_colname].isin(df_ts)]
df_all = pd.concat([df0, df], ignore_index=True)
else:
df_all = df
return df_all


def get_nrevss_data(url_bases, url_ext, locations, location_name=None, folder_path="auxiliary-data/rsv/nrevss/"):

for u in url_bases:
headers = []
rows = []
filename = ""

for location in locations:
if location_name is None:
loc_name = location
else:
loc_name = location_name
try:
r = requests.get(u + location + url_ext)
if r.status_code == 200:

# parse the html
soup = BeautifulSoup(r.text, 'html.parser')

# search for the <table> tags
tables = soup.find_all("table")

for t in tables:
filename = (t.caption.text.strip().replace("for " + loc_name, "").
replace("Insufficient Antigen Data: ", "").
replace("Insufficient Data: ", "").lower().
replace(" ", "_").replace("(", "").replace(")", "") + ".csv")
headers = get_table_headers(t)
state_rows = get_table_rows(t)
rows.extend(state_rows)

else:
print("Error getting page; HTTP status " + str(r.status_code))

except Exception as e:
print("Error for state " + location + " and URL " + u + location + url_ext)
print(e)

# Add date information
df = pd.DataFrame(rows)
df = df.rename(columns=dict(zip(range(len(headers)), headers)))
df["repweekdate"] = pd.to_datetime(df["repweekdate"]).dt.strftime("%Y-%m-%d")
file_dir = folder_path + filename
df_all = historical_date(df, file_dir, "repweekdate")
# write the csv file
df_all.to_csv(file_dir, index=False)


def get_nssp_data(req_res, file_name, folder_path="auxiliary-data/rsv/nssp/", date_colname="End Date of MMWR Week"):
df_res = pd.read_csv(io.StringIO(req_res.text))
df_res["as_of"] = date.today().strftime("%Y-%m-%d")
df_res[date_colname] = pd.to_datetime(
df_res[date_colname]).dt.strftime("%Y-%m-%d")
file_dir = folder_path + file_name
df_res_all = historical_date(df_res, file_dir, date_colname)
df_res_all.to_csv(file_dir, index=False)


# RSV State Trends - The National Respiratory and Enteric Virus Surveillance System (NREVSS)
# state_bases = [
# "https://www.cdc.gov/surveillance/nrevss/images/rsvstate/RSV1PPCent3AVG_State",
# "https://www.cdc.gov/surveillance/nrevss/images/rsvstate/RSV4PPCent3AVG_State",
# "https://www.cdc.gov/surveillance/nrevss/images/rsvstate/RSV14NumCent5AVG_State"]
# hub_locations = pd.read_csv("auxiliary-data/location_census/locations.csv")
# states = list(hub_locations["abbreviation"])
# to_remove = ["AS", "GU", "MP", "PR", "UM", "VI", "US"]
# for i in to_remove:
# states.remove(i)
# get_nrevss_data(state_bases, ".htm", states)

# RSV National Trends - The National Respiratory and Enteric Virus Surveillance System (NREVSS)
# nat_bases = {"https://www.cdc.gov/surveillance/nrevss/images/trend_images/RSV124PP_",
# "https://www.cdc.gov/surveillance/nrevss/images/trend_images/RSV14Num_"}
# get_nrevss_data(nat_bases, ".htm", ["Nat"], location_name="the ")

# Weekly Rates of Laboratory-Confirmed RSV Hospitalizations - Respiratory Syncytial Virus Hospitalization
# Surveillance Network (RSV-NET)
res = requests.get("https://data.cdc.gov/api/views/29hc-w46k/rows.csv?accessType=DOWNLOAD")
df_rsvnet = pd.read_csv(io.StringIO(res.text),
dtype={'State': "string", 'Season': "string", 'MMWR Week': "string",
'Week ending date': "string", 'Age Category': "string",
'Sex': "string", 'Race': "string", 'Rate': "float",
'Cumulative Rate': "float"})
df_rsvnet.to_parquet("auxiliary-data/rsv/rsv-net/" + str(date.today()) +
"_weekly_rates_lab_confirmed_rsv_hosp.parquet")

# Weekly Respiratory Virus Vaccination Data, Children 6 Months-17 Years and Adults 18 Years and
# Older, National Immunization Survey
res = requests.get("https://data.cdc.gov/resource/5c6r-xi2t.csv")
df_rsvvax = pd.read_csv(io.StringIO(res.text))
df_rsvvax.to_parquet("auxiliary-data/rsv/rsv-vax/weekly_respiratory_virus_vaccination_data.parquet")

# National Emergency Department Visits for COVID-19, Influenza, and Respiratory Syncytial Virus
# Weekly Emergency Department Visits by Age Group
res = requests.get("https://www.cdc.gov/wcms/vizdata/live/CSELS_DHIS_NSSP/Resp_ED_Count_Weekly_National.csv")
get_nssp_data(res, "Resp_ED_Count_Weekly_National.csv".lower())

res = requests.get("https://www.cdc.gov/wcms/vizdata/live/CSELS_DHIS_NSSP/Resp_ED_Percent_Weekly_National.csv")
get_nssp_data(res, "Resp_ED_Percent_Weekly_National.csv".lower())

res = requests.get("https://data.cdc.gov/api/views/vutn-jzwm/rows.csv?accessType=DOWNLOAD")
get_nssp_data(res, "Resp_ED_Percent_Weekly_state.csv".lower(), date_colname="week_end")
Loading

0 comments on commit f07593b

Please sign in to comment.