From 495a666c6181fd4bfd583233ef5107ab963d4a06 Mon Sep 17 00:00:00 2001 From: Nicholas Reich Date: Sun, 20 May 2018 08:13:21 -0400 Subject: [PATCH] incorporating Evan's changes from PR #138. Also, updates to manuscript, bib, and model table. --- scripts/point-estimate-scores.R | 8 +- writing/comparison/comparison-manuscript.Rnw | 534 ++++++++++-------- .../comparison/static-figures/model-table.tex | 50 +- writing/flusightnetwork.bib | 53 ++ 4 files changed, 373 insertions(+), 272 deletions(-) diff --git a/scripts/point-estimate-scores.R b/scripts/point-estimate-scores.R index d17f6b96..2d763399 100644 --- a/scripts/point-estimate-scores.R +++ b/scripts/point-estimate-scores.R @@ -43,9 +43,11 @@ ests_with_truth <- tmp.df %>% dplyr::rename(Calendar.Week = `Calendar Week`) %>% left_join(truths) %>% mutate( + target_type = ifelse(Target %in% c("Season peak week", "Season onset"), "Week", "wILI"), obs_value = as.numeric(as.character(Valid.Bin_start_incl)), - err = Value - obs_value - ) - + # todo: fix error calculation for week targets + err = ifelse(target_type == "wILI", Value - obs_value, Value - obs_value) + ) + write.csv(ests_with_truth, file="scores/point_ests.csv", quote = FALSE, row.names = FALSE) diff --git a/writing/comparison/comparison-manuscript.Rnw b/writing/comparison/comparison-manuscript.Rnw index 349d7624..25265ee6 100644 --- a/writing/comparison/comparison-manuscript.Rnw +++ b/writing/comparison/comparison-manuscript.Rnw @@ -2,7 +2,7 @@ \title{Forecasting Seasonal Influenza in the U.S.: A collaborative multi-year, multi-model assessment of forecast performance} -\author{Nicholas G Reich, Matthew Biggerstaff, Logan Brooks, Spencer Fox, Michael Johansson, Sasikiran Kandula, Craig McGowan, Evan Moore, Dave Osthus, Evan Ray, Roni Rosenfeld, Jeffrey Shaman, Abhinav Tushar, Teresa Yamana} +\author{Nicholas G Reich, Logan Brooks, Spencer Fox, Sasikiran Kandula, //Craig McGowan, Evan Moore, Dave Osthus, Evan Ray, Abhinav Tushar, Teresa Yamana, \\Matthew Biggerstaff, Michael Johansson, Roni Rosenfeld, Jeffrey Shaman} \usepackage[letterpaper, margin=1in]{geometry} % margin \usepackage{lineno}% add line numbers @@ -40,7 +40,7 @@ \end{abstract} -\section*{Significance Statement} +\subsection*{Significance Statement} 120 word %\tableofcontents @@ -198,6 +198,12 @@ scores_by_region_targettype <- scores_adj %>% @ +<>= +n_models <- nrow(scores_by_model) +n_seasons <- nrow(scores_by_season) +n_regions <- nrow(scores_by_region) +@ + \section{Introduction} @@ -212,41 +218,27 @@ In part due to the growing recognition of the importance of systematically integ %These studies serve as an important counterweight to many forecasting efforts remain one-off academic exercises focused on a single application or a single method. By enabling a standardized comparison in a single application, these studies greatly improve our understanding of which models perform best in certain settings, of how results can best be disseminated and used by decision-makers, and of what the bottlenecks are in terms of improving forecasts. -The aim of this study is to present a standardized comparison of a range of different forecasting models for influenza in the US, over multiple seasons. -Our work brings together 22 models from five different institutions: Carnegie Mellon University, Columbia University, Los Alamos National Laboratory, University of Massachusetts-Amherst, and University of Texas-Austin. 
-While most groups developed more than one model for consideration, the models developed within a single group through the use of common data sources and/or methodologies often bear similarities to each other. -Having models from different teams enhances the diversity of the models presented. - While such multi-model comparisons exist in the literature for single-season performance, here we compare the same models over seven flu seasons. -To our knowledge, this is the first documented comparison of multiple real-time forecasting models (from different teams) across multiple seasons for any infectious disease application. +To our knowledge, this is the first documented comparison of multiple real-time forecasting models from different teams across multiple seasons for any infectious disease application. Since each season has unique dynamical structure, multi-season comparisons like this have great potential to improve our understanding of how models perform over the long term and which models may be reliable in the future. -This work relies on the forecasting structure developed by existing public forecasting challenges. -Starting in the 2013/2014 influenza season, the U.S. Centers for Disease Control and Prevention (CDC) has run the ``Forecast the Influenza Season Collaborative Challenge'' (a.k.a. FluSight) each influenza season, soliciting prospective, real-time weekly forecasts for specific influenza season metrics from teams across the world.\cite{Biggerstaff2016,Biggerstaff2018} +%This work relies on the forecasting structure developed by existing public forecasting challenges. +Starting in the 2013/2014 influenza season, the U.S. Centers for Disease Control and Prevention (CDC) has run the ``Forecast the Influenza Season Collaborative Challenge'' (a.k.a. FluSight) each influenza season, soliciting prospective, real-time weekly forecasts of regional-level influenza-like illness measures from teams across the world (Figure \ref{fig:intro-schematic}).\cite{Biggerstaff2016,Biggerstaff2018} These forecasts are displayed together on a website in real-time and are evaluated for accuracy at the end of the season.\cite{PhiResearchLab} This effort has galvanized a community of scientists interested in forecasting, creating an organic testbed for improving both our technical understanding of how different forecast models perform and also how to integrate these models into decision-making. - +%We present a standardized comparison of a range of different forecasting models for influenza in the US, over multiple seasons. Building on the structure of the FluSight challenges (and those of other collaborative forecasting efforts \cite{Smith2017,Viboud2017}), a subset of FluSight participants formed a consortium in early 2017 to facilitate direct comparison and fusion of modeling approaches. -In this paper, we provide a detailed analysis of the performance of 22 different models from five different teams over the course of seven influenza seasons. -Drawing on the different expertise of the five teams allows us to make fine-grained and standardized comparisons of distinct approaches to disease incidence forecasting that use different data sources and models. +Our work brings together 22 models from five different institutions: Carnegie Mellon University, Columbia University, Los Alamos National Laboratory, University of Massachusetts-Amherst, and University of Texas-Austin. 
+% While most groups developed more than one model for consideration, the models developed within a single group through the use of common data sources and/or methodologies often bear similarities to each other. +% Having models from different teams enhances the diversity of the models presented. +In this paper, we provide a detailed analysis of the performance of these different models in forecasting the targets defined by the CDC FluSight challenge organizers (Figure \ref{fig:intro-schematic}B). +Drawing on the different expertise of the five teams allows us to make fine-grained and standardized comparisons of distinct approaches to disease incidence forecasting that use different data sources and modeling frameworks. In addition to analyzing comparative model performance over seasons, this work identifies key bottlenecks that limit the accuracy and generalizability of current forecasting efforts. Specifically, we present quantitative analyses of the impact that incomplete or partial case reporting has on forecast accuracy. Additionally, we assess whether purely statistical models show similar performance to models that consider explicit mechanistic models of disease transmission. - - -\section{Methods} - -\subsection{FluSight Challenge Overview} - -Detailed methodology and results from previous FluSight challenges have been published\cite{Biggerstaff2016,Biggerstaff2018}, and we summarize the key features of the challenge here. - -The FluSight challenge focuses on forecasts of the weighted percentage of doctor's office visits where the patient showed symptoms of an influenza-like illness in a particular region. Weighting is done by state population as the data are aggregated to the regional level. -This is a standard measure of seasonal flu activity, for which public data is available for the US back to the 1997/1998 influenza season (Figure \ref{fig:intro-schematic}A). -During each influenza season, these data are updated each week by the CDC. When the most recent data are released, the prior weeks' reported wILI data may also be revised. -The unrevised data, available at a particular moment in time, is available via the DELPHI real-time epidemiological data API beginning in the 2014/2015 season.\cite{DELPHI} -This API enables researchers to ``turn back the clock'' to a particular moment in time and use the data available at that time. This tool facilitates more accurate assessment of how models would have performed in real-time. +[[NEED SNAPPY CLOSING SENTENCE HERE]] \begin{figure}[htbp] \begin{center} @@ -300,225 +292,8 @@ fludata %>% @ -The FluSight challenges have defined seven forecasting targets of particular public health relevance. Three of these targets are fixed scalar values for a particular season: onset week, peak week, and peak intensity (i.e., the maximum observed wILI percentage). The remaining four targets are the observed wILI percentages in each of the subsequent four weeks (Figure \ref{fig:intro-schematic}B). - -The FluSight challenges have also required that all forecast submissions follow a particular format. A single submission file (a comma-separated text file) contains the forecast made for a particular epidemic week (EW) of a season. Standard CDC definitions of epidemic week are used.\cite{NewMexicoDepartmentofHealth,Niemi2015,Tushar2018} Each file contains binned predictive distributions for seven specific targets across the 10 HHS regions of the US plus the national level. Each file contains over 8000 rows and typically is about 400KB in size. 
- -To be included in the model comparison presented here, previous participants in the CDC FluSight challenge were invited to provide out-of-sample forecasts for the 2010/2011 through 2016/2017 seasons. -For each season, files were submitted for epidemic week 40 (EW40) of the first calendar year of the season through EW20 of the following calendar year. -(For seasons that contained an EW53, an additional file labeled EW53 was included.) -For each model, this involved creating 233 separate forecast submission files, one for each of the weeks in the seven training seasons. -Each forecast file represented a single submission file, as would be submitted to the CDC challenge. -Each team created their submitted forecasts in a prospective, out-of-sample fashion, i.e., fitting or training the model only on data available before the time of the forecast (see Figure \ref{fig:intro-schematic}). -Some data sources (e.g. wILI data prior to the 2014/2015 season) were not archived in a way that made data reliably retrievable in this ``real-time'' manner. In these situations, teams were still allowed to use these data sources with best efforts made to ensure forecasts were made using only data available at the time forecasts would have been made. - -\subsection{Summary of Models} - -Five teams each submitted between 1 and 9 separate models for evaluation (Table \ref{tab:model-list}). -A wide range of methodological approaches and modeling paradigms are included in the set of forecast models. -For example, seven of the models utilize a compartmental structure (e.g. Susceptible-Infectious-Recovered), a model framework that directly encodes both the transmission and the susceptible-limiting dynamics of infectious disease outbreaks. -Other less directly mechanistic models use statistical approaches to model the outbreak phenomenon directly by incorporating recent incidence and seasonal trends. -One model, {\tt Delphi-Stat} is an ensemble model, a combination of other models from the Delphi team. -Six models directly incorporate external data (i.e., not just the wILI measurements from the CDC ILINet dataset), including historical humidity data and Google search data. -Two models stand out as being reference models. -One shared feature of these models is that they never change based on recent data. -The {\tt Delphi-Uniform} model always provides a forecast that assigns equal probability to all possible outcomes. -The {\tt ReichLab-KDE} model yields predictive distributions based entirely on data from other seasons using kernel density estimation (KDE) for seasonal targets and a generalized additive model with cyclic penalized splines for weekly incidence. -Throughout the manuscript when we refer to the `historical baseline` model we mean the {\tt ReichLab-KDE} model. -Because this model represents a prediction that essentially summarizes historical data, we consider this model an appropriate baseline model to reflect historical trends and refer to this model as the `historical baseline' through the remainder of the manuscript. -Once submitted to the central repository, the models were not updated or modified except in four cases to fix explicit bugs in the code that yielded numerical problems with the forecasts. -(In all cases, the updates did not substantially change the performance of the updated models.) -Re-fitting of models or tuning of model parameters was explicitly discouraged to avoid unintentional overfitting of models. 
- -\input{./static-figures/model-table.tex} - -\subsection{Metric Used for Evaluation and Comparison} - -Influenza forecasts have been evaluated by the CDC primarily using a variation of the log-score, a measure that enables evaluation of both the precision and accuracy of a forecast.\cite{Gneiting2007} -The log-score for a model $m$ is defined as $\log f_m(z^*|\bf{x})$ where $f_m(z|\bf{x})$ is the predicted density function from model $m$ for target $Z$, conditional on some data $\bf{x}$, $z^*$ is the observed value of the target $Z$, and $\log$ is the natural logarithm. -The log-score is a ``proper`` scoring rule, which has the practical implication that linear combinations (i.e., arithmetic means) of log scores will also be proper.\cite{Gneiting2007} - -Consistent with the primary evaluation performed by the CDC, we used a modified form of the log-score to evaluate forecasts. -The modified log-scores are computed for the targets on the wILI percentage scale such that predictions within $\pm$ 0.5 percentage points are considered accurate, i.e., modified log score = $\log \int_{z^* -.5}^{z^* + .5} f_m(z|{\bf{x}})dz$. -For the targets on the scale of epidemic weeks, predictions within $\pm$ 1 week are considered accurate, i.e., modified log score = $\log \int_{z^* -1}^{z^* + 1} f_m(z|{\bf{x}})dz$. -While this modification means that the resulting score is not formally a proper scoring rule, some have suggested that improper scores derived from proper scoring rules may, with large enough sample size, have negligible differences in practice.\cite{Gneiting2007} % see especially last paragraph of section 2.3 -Additionally, this modified log score has the advantage of having a clear interpretation and was motivated and designed by public health officials to reflect an accuracy of practical significance. -Hereafter, we will refer to these modified log scores as simply log scores. - -Average log scores can be used to compare models' performance in forecasting for different locations, seasons, targets, or times of season. -In practice, each model $m$ has a set of log scores associated with it that are region-, target-, season-, and week-specific. -We represent one specific scalar log score value as $\log f_{m,r,t,s,w}(z^*|\bf{x})$. -These values can be averaged across any of the indices to create a summary measure of performance. -For example, -\begin{eqnarray} -LS_{m,\cdot,t,\cdot,\cdot} & = & \frac{1}{N} \sum_{r,s,w} \log f_{m,r,t,s,w}(z^*|{\bf x}) -\end{eqnarray} -represents a log score for model $m$ and target $t$ averaged across all regions, seasons and weeks. - -While log scores are not on a particularly interpretable scale, a simple transformation enhances interpretability substantially. -Exponentiating an average log score yields a forecast score equivalent to the geometric mean of the probabilities assigned to the eventually observed outcome (that is, to bins eventually considered accurate). -The geometric mean is an alternative measure of central tendency to an arithmetic mean, representing the $N^{th}$ root of a product of $N$ numbers. 
-Using the example above, we then have that -\begin{eqnarray} -S_{m,\cdot,t,\cdot,\cdot} = \exp \left ( LS_{m,\cdot,t,\cdot,\cdot} \right ) & = & \exp \left ( \frac{1}{N} \sum_{r,s,w} \log f_{m,r,t,s,w}(z^*|{\bf x}) \right ) \\ - & = & \left ( \prod_{r,s,w} f_{m,r,t,s,w}(z^*|{\bf x}) \right ) ^{1/N} -\end{eqnarray} -In this setting, this score has the intuitive interpretation of being the average probability assigned to the true outcome (where average is considered to be a geometric average). -Hereafter, we will refer to an average score as an exponentiated average log score. -In all cases, we compute the averages arithmetically on the log scale and only exponentiate before reporting and interpreting a final number. -Therefore, all reported average scores can be interpreted as the corresponding geometric means, or as the correponding average probabilities assigned to the true outcome. - -Following the convention of the CDC challenges, we only included certain weeks in the calculation of the average log scores for each target. -This focuses model evaluation on periods of time that are more relevant for public health decision making. -Forecasts of season onset are evaluated based on the forecasts that are received up to six weeks after the observed onset week within a given region. -Peak week and peak intensity forecasts were scored for all weeks in a specific region-season up until the wILI measure drops below the regional baseline level for the final time. -(All weeks are scored if wILI never goes above the baseline.) -%Forecasts of season peak and intensity are evaluated through the first forecast received after the weighted ILI goes below the regional baseline for the final time during a given region-season. -Week-ahead forecasts are evaluated using forecasts received four weeks prior to the onset week through forecasts received three weeks after the weighted ILI goes below the regional baseline for the final time. -In a region-season without an onset, all weeks are scored. -To ensure all calculated summary measures would be finite, all log scores with values of less than -10 were assigned the value -10, following CDC scoring conventions. -All scores were based on ``ground truth'' values of wILI data obtained as of September 27, 2017. - -% \subsection{Formal comparisons of model performance} -% -% Model-based comparisons of forecast accuracy are hindered by the high correlation of sequential forecasts and by outlying observations. -% When observations assign no probability to the eventually observed outcome they have a log-score of $-\infty$. - -%Things to confirm: removed weeks that CDC does not score, no onset seasons and multi peak years are handled appropriately - -\subsection{Specific model comparisons}\label{sec:delay-model} - -\subsubsection*{Analysis of data revisions} -% from fig-delay-model-coefs.R -% fm4 <- glm(exp(multi_bin_score) ~ Model + Target + factor(forecasted_epiweek) + bias_first_report_factor, data=scores_for_analysis) - -The CDC publicly releases data on doctor's office visits due to ILI each week. -These data for previous weeks (especially the most recent ones) are occasionally revised, due to new or updated data being reported to the CDC since their last publication. -While often these revisions are fairly minor or non-existent, at other times, these revisions can be substantial, changing the reported wILI value by over 50\% of the originally reported value. 
-Since these data are used by forecasters to generate current forecasts, these forecasts can be contaminated by the initially reported, biased data. - -We used a regression model to analyze the impact of these unrevised reports on forecasting. -Specifically, for each region and epidemic week we calculated the difference between the first and the last reported ILI values for each epidemic week for which forecasts were generated in the seven seasons under consideration. -We then created a categorical variable ($X$) with a binned representation of these differences using the following six categories covering the entire range of observed values: (-3.5,-2.5], (-2.5,-1.5], ..., (1.5,2.5]. -Using the forecasting results from the four most accurate individual non-ensemble models, ({\tt ReichLab-KCDE}, {\tt LANL-DBM}, {\tt Delphi-DeltaDensity1}, {\tt CU-EKF\_SIRS}), we then fit the following linear regression model -\begin{equation} -S_i = \beta + \alpha_{m(i)} + \gamma_{t(i)} + \lambda_{w(i)} + {\mathbf \theta}\cdot X_i + \epsilon_i -\end{equation} -where $S_i$ is the score, the index $i$ indexes a specific set of subscripts $\{m,r,t,s,w\}$, and the $\alpha_{m(i)}$, $\gamma_{t(i)}$, and $\lambda_{w(i)}$ are model-, target-, and week-specific fixed effects, respectively. (The notation $m(i)$ refers to the model contained in the $i$th observation of the dataset.) The error term is assumed to follow a Gaussian distribution with mean zero and an estimated variance parameter. -The parameter of interest in the model is the vector ${\mathbf \theta}$, which represents the average change in score based on the magnitude of the bias in the latest available wILI value, adjusted for differences based on model, target, and week-of-season. The $[-0.5,+0.5]$ change category was taken as a baseline category and the corresponding $\theta$ entry constrained to be 0, so that other $\theta$ entries represent deviations from this baseline. -%the expected changes in average score from the reference category (defined as the bin representing changes of between $\pm$ 0.5 from the original reported value) adjusting for model, target and week-of-season. - -\subsubsection*{Mechanistic vs. statistical models} - -% Infectious disease modeling has proven to be fertile ground for statisticians, mathematicians, and quantitative modelers for over a century. -There is not a consensus on a single best modeling approach or method for forecasting the dynamic patterns of infectious disease outbreaks in both endemic and emergent settings. -Semantically, modelers and forecasters often use a dichotomy of mechanistic vs. statistical (or `phenomenological') models to represent two different philosophical approaches to modeling. -Mechanistic models for infectious disease consider the biological underpinnings of disease transmission, and in practice are implemented as variations on the Susceptible-Infectious-Recovered (SIR) model. -Statistical models largely ignore the biological underpinnings and theory of disease transmission and focus instead on using data-driven, empirical and statistical approaches to make the best forecasts possible of a given dataset, or phenomenon. - -However, in practice, this dichotomized distinction is less clear than it is in theory. -For example, statistical models for infectious disease counts may encode an autoregressive term for incidence (i.e., as done by the {\tt ReichLab-KCDE} model). -This could be interpreted as representing a transmission process from one time period to another. 
-In another example, the {\tt LANL-DBM} model has an explicit SIR compartmental model component but also leverages a hierarchical discrepancy which is purely statistical. -The models from Columbia University used a statistical `now-casting' approach for their 1-week ahead forecasts, but after that relied on different variations of an SIR model. -% Both approaches are commonly used and both have advantages and disadvantages in different settings. - -We categorized models according to whether or not they had any explicit compartmental framework (Table \ref{tab:model-list}). -We then took the top three performing compartmental models (i.e., models with some kind of an underlying compartmental structure) and compared their performance with the top three individual component models without compartmental structure. -We excluded multi-model ensemble models (i.e., {\tt Delphi-Stat}) from this comparison. -Separately for each target, we compared the average score of the top three compartmental models to the average score of the top three non-compartmental models. - -\subsection{Reproducibility and data availability} - -To maximize the reproducibility and data availability for this project, the data and code for the entire project (excluding specific model code) are publicly available. -The project is available on GitHub\cite{fsngithub2018}, with a permanent repository [[stored on Zotero]]. -All of the forecasts may be interactively browsed on the website http://flusightnetwork.io. -A web applet with interactive visualizations of the model evaluations is available at https://ermoore.shinyapps.io/FSN\_Model\_Comparison/. [[link to change]] -Additionally, this manuscript was dynamically generated using \Sexpr{R.version.string}, Sweave, and knitr, tools for intermingling manuscript text with R code that run the central analyses, minimizing the chances for errors transcribing or translating results.\cite{Xie2015,RCore2017}. - \section{Results} -\subsection{Comparing models' forecasting performance by season} -<>= -kde_logscore <- unlist(scores_by_model[which(scores_by_model$Model=="ReichLab-KDE"), "avg_logscore"]) -max_logscore <- max(scores_by_model$avg_logscore) -min_logscore <- min(scores_by_model$avg_logscore) -max_logscore_model <- as.character(unlist(scores_by_model[which(scores_by_model$avg_logscore == max_logscore), "Model"])) -min_logscore_model <- as.character(unlist(scores_by_model[which(scores_by_model$avg_logscore == min_logscore), "Model"])) -n_above_kde <- sum(scores_by_model$avg_logscore>kde_logscore) -n_models <- nrow(scores_by_model) -n_seasons <- nrow(scores_by_season) -n_regions <- nrow(scores_by_region) - -tmp <- scores_by_model_season %>% group_by(Model) %>% - summarize(n_seasons_above_maxavg = sum(avg_logscore>max_logscore)) -@ - - -Averaging across all targets and locations, forecast scores varied widely by model and season (Figure \ref{fig:results-model-season}). -The historical baseline model ({\tt ReichLab-KDE}) showed an average seasonal score of -\Sexpr{specify_decimal(exp(kde_logscore), 2)}, -meaning that in a typical season, across all targets and locations, this model assigned on average -\Sexpr{specify_decimal(exp(kde_logscore), 2)} -probability to the eventually observed value. -The model with the highest average seasonal forecast score -({\tt \Sexpr{sanitize(max_logscore_model)}}) -and lowest -({\tt \Sexpr{sanitize(min_logscore_model)}}) -had scores of \Sexpr{specify_decimal(exp(max_logscore), 2)} and \Sexpr{specify_decimal(exp(min_logscore), 2)}, respectively. 
-Of the \Sexpr{n_models} models, \Sexpr{n_above_kde} models -(\Sexpr{specify_decimal(n_above_kde/n_models*100)}\%) -showed higher average seasonal forecast score than the historical average. -Season-to-season variation was substantial, with -\Sexpr{sum(tmp$n_seasons_above_maxavg>0)} -models having at least one season with greater average forecast score than the -{\tt \Sexpr{sanitize(max_logscore_model)}} -model did. - -<>= -ggplot(scores_by_model_season, aes(x=Model, y=exp(avg_logscore))) + - geom_point(alpha=.5, aes(color=Season)) + - geom_point(data=scores_by_model, shape="x", size=1, stroke=5)+ - scale_color_brewer(palette="Dark2") + - ylab("average forecast score") + - theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5, - #family="Courier", - face=ifelse( - levels(scores_by_model_season$Model)%in% compartment_models, - "bold", - ifelse( - levels(scores_by_model_season$Model) == "ReichLab-KDE", - "italic", - "plain" - ) - ), - color = ifelse( - levels(scores_by_model_season$Model) == "ReichLab-KDE", - "red", - "black" - ) - )) - -@ - - -% \begin{figure}[htbp] -% \begin{center} -% \includegraphics[width=\textwidth]{figures/fig-results-model-season.pdf} -% \caption{Average forecast score, aggregated across targets and plotted separately for each model and season. Models are sorted from least score (left) to most score (right). Dots show average score across all targets and regions for a given season. The x marks the geometric mean of the seven seasons. The names of compartmental models are shown in bold face. The {\\tt ReichLab-KDE} model can be thought of as the historical baseline model.} -% \label{fig:results-model-season} -% \end{center} -% \end{figure} - -The six top-performing models utilized a range of methodologies, highlighting that very different approaches can result in very similar overall performance. -The overall best model was an ensemble model ({\tt Delphi-Stat}) that used a weighted combination of other models from the Delphi group. -Both the {\tt ReichLab-KCDE} and the {\tt Delphi-DeltaDensity1} model utilized kernel conditional density estimation, a non-parametric statistical methodology that is a distribution-based variation on nearest-neighbors regression. -These models used different implementations and different input variables, but showed similarly strong performance across all seasons. -The {\tt UTAustin-edm} and {\tt Delphi-DeltaDensity2} models also used variants of nearest-neighbors regression, although overall scores for these models were not consistent, indicating that implementation details and/or input variables can impact the performance of this approach. -The {\tt LANL-DBM} and {\tt CU-EKF\_SIRS} models both rely on a compartmental model of influenza transmission; however the methodologies used to fit and forecast were different for these approaches. -The CU model used an ensemble Kalman filter approach to generate forecasts, while the LANL model sampled from the posterior predictive distribution using Markov chain Monte Carlo (MCMC). -The {\tt ReichLab-SARIMA2} model used a classical statistical time-series model, the seasonal auto-regressive integrated moving average model (SARIMA), to fit and generate forecasts. -Interestingly, several pairs of models, although having strongly contrasting methodological approaches, showed similar overall performance; e.g., {\tt CU-EKF\_SIRS} and {\tt ReichLab-SARIMA2}, {\tt LANL-DBM} and {\tt ReichLab-KCDE}. 
\subsection{Performance in forecasting week-ahead incidence} <<>>= @@ -958,6 +733,48 @@ The historical baseline model showed score in forecasting peak intensity. Overall, \Sexpr{n_above_kde_pk} of \Sexpr{n_models} models (\Sexpr{round(n_above_kde_pk/n_models*100)}\%) had more forecast score than the historical baseline model in the scoring period of interest for peak intensity. +<>= +dat <- scores_adj %>% + group_by(Model, + Target, + Season) %>% + summarise( + avg_score = mean(score_adj), + Skill = exp(avg_score), + min_score = min(score_adj) + ) %>% + ungroup() %>% + mutate(Model = reorder(Model, avg_score)) + +## these lines create a new column with the 'baseline' model score +dat <- dat %>% + group_by(Target, Season) %>% + mutate( + baseline_score = avg_score[Model=="ReichLab-KDE"] + ) %>% + ungroup() %>% + ## these lines then create the skill for the baseline model and a "% skill change over baseline" column + mutate( + baseline_skill = exp(baseline_score), + pct_diff_baseline_skill = (Skill - baseline_skill)/baseline_skill, + over_baseline = ifelse(pct_diff_baseline_skill > 0, 1, 0) + ) + +dat <- dat %>% + group_by(Model, Target) %>% + mutate(seasons = sum(over_baseline == 1)) + +palette <- colorRampPalette(c("white", "darkblue")) + +ggplot(dat, aes(x=Target, y=Model, fill=factor(seasons))) + + geom_tile(color = "gray90") + ylab(NULL) + xlab(NULL) + + scale_fill_brewer(palette(7)) + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + guides(fill = guide_legend(title = "Seasons \nAbove \nBaseline", reverse = T)) + + coord_fixed(ratio = 0.7) +@ + % \subsection{Performance of models by location} % % Consider adding map that averages across all models, or shows max skill per region, as it varies quite a bit. @@ -970,6 +787,85 @@ Overall, \Sexpr{n_above_kde_pk} of \Sexpr{n_models} models (\Sexpr{round(n_above % Hypothesis: year-to-year variation within a region (as measured by skill -> lower skill = larger variation) is associated with skill of (top) models for that region. % lower skill = larger variation --> lower component model skill +\subsection{Comparing models' forecasting performance by season} +<>= +kde_logscore <- unlist(scores_by_model[which(scores_by_model$Model=="ReichLab-KDE"), "avg_logscore"]) +max_logscore <- max(scores_by_model$avg_logscore) +min_logscore <- min(scores_by_model$avg_logscore) +max_logscore_model <- as.character(unlist(scores_by_model[which(scores_by_model$avg_logscore == max_logscore), "Model"])) +min_logscore_model <- as.character(unlist(scores_by_model[which(scores_by_model$avg_logscore == min_logscore), "Model"])) +n_above_kde <- sum(scores_by_model$avg_logscore>kde_logscore) + +tmp <- scores_by_model_season %>% group_by(Model) %>% + summarize(n_seasons_above_maxavg = sum(avg_logscore>max_logscore)) +@ + + +Averaging across all targets and locations, forecast scores varied widely by model and season (Figure \ref{fig:results-model-season}). +The historical baseline model ({\tt ReichLab-KDE}) showed an average seasonal score of +\Sexpr{specify_decimal(exp(kde_logscore), 2)}, +meaning that in a typical season, across all targets and locations, this model assigned on average +\Sexpr{specify_decimal(exp(kde_logscore), 2)} +probability to the eventually observed value. 
+The model with the highest average seasonal forecast score +({\tt \Sexpr{sanitize(max_logscore_model)}}) +and lowest +({\tt \Sexpr{sanitize(min_logscore_model)}}) +had scores of \Sexpr{specify_decimal(exp(max_logscore), 2)} and \Sexpr{specify_decimal(exp(min_logscore), 2)}, respectively. +Of the \Sexpr{n_models} models, \Sexpr{n_above_kde} models +(\Sexpr{specify_decimal(n_above_kde/n_models*100)}\%) +showed higher average seasonal forecast score than the historical average. +Season-to-season variation was substantial, with +\Sexpr{sum(tmp$n_seasons_above_maxavg>0)} +models having at least one season with greater average forecast score than the +{\tt \Sexpr{sanitize(max_logscore_model)}} +model did. + +<>= +ggplot(scores_by_model_season, aes(x=Model, y=exp(avg_logscore))) + + geom_point(alpha=.5, aes(color=Season)) + + geom_point(data=scores_by_model, shape="x", size=1, stroke=5)+ + scale_color_brewer(palette="Dark2") + + ylab("average forecast score") + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5, + #family="Courier", + face=ifelse( + levels(scores_by_model_season$Model)%in% compartment_models, + "bold", + ifelse( + levels(scores_by_model_season$Model) == "ReichLab-KDE", + "italic", + "plain" + ) + ), + color = ifelse( + levels(scores_by_model_season$Model) == "ReichLab-KDE", + "red", + "black" + ) + )) + +@ + + +% \begin{figure}[htbp] +% \begin{center} +% \includegraphics[width=\textwidth]{figures/fig-results-model-season.pdf} +% \caption{Average forecast score, aggregated across targets and plotted separately for each model and season. Models are sorted from least score (left) to most score (right). Dots show average score across all targets and regions for a given season. The x marks the geometric mean of the seven seasons. The names of compartmental models are shown in bold face. The {\\tt ReichLab-KDE} model can be thought of as the historical baseline model.} +% \label{fig:results-model-season} +% \end{center} +% \end{figure} + +The six top-performing models utilized a range of methodologies, highlighting that very different approaches can result in very similar overall performance. +The overall best model was an ensemble model ({\tt Delphi-Stat}) that used a weighted combination of other models from the Delphi group. +Both the {\tt ReichLab-KCDE} and the {\tt Delphi-DeltaDensity1} model utilized kernel conditional density estimation, a non-parametric statistical methodology that is a distribution-based variation on nearest-neighbors regression. +These models used different implementations and different input variables, but showed similarly strong performance across all seasons. +The {\tt UTAustin-edm} and {\tt Delphi-DeltaDensity2} models also used variants of nearest-neighbors regression, although overall scores for these models were not consistent, indicating that implementation details and/or input variables can impact the performance of this approach. +The {\tt LANL-DBM} and {\tt CU-EKF\_SIRS} models both rely on a compartmental model of influenza transmission; however the methodologies used to fit and forecast were different for these approaches. +The CU model used an ensemble Kalman filter approach to generate forecasts, while the LANL model sampled from the posterior predictive distribution using Markov chain Monte Carlo (MCMC). +The {\tt ReichLab-SARIMA2} model used a classical statistical time-series model, the seasonal auto-regressive integrated moving average model (SARIMA), to fit and generate forecasts. 
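+
+As a point of reference for readers less familiar with this model class, a minimal SARIMA fit to a weekly wILI series might look like the sketch below. This is an illustration only, not the {\tt ReichLab-SARIMA2} implementation; the {\tt forecast} package, the placeholder series {\tt wili}, and the particular model orders are assumptions made for the example.
+<<sarima-sketch, eval=FALSE, echo=TRUE>>=
+library(forecast)
+## `wili` stands in for a multi-season weekly wILI series for one region
+wili <- runif(4 * 52, min = 0.5, max = 6)   # placeholder values only
+wili_ts <- ts(wili, frequency = 52)
+## one possible seasonal ARIMA specification (with seasonal differencing)
+fit <- Arima(wili_ts, order = c(2, 0, 1),
+             seasonal = list(order = c(0, 1, 0), period = 52))
+fc <- forecast(fit, h = 4)   # 1- through 4-week-ahead predictive distributions
+@
+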
+Interestingly, several pairs of models, although having strongly contrasting methodological approaches, showed similar overall performance; e.g., {\tt CU-EKF\_SIRS} and {\tt ReichLab-SARIMA2}, {\tt LANL-DBM} and {\tt ReichLab-KCDE}. + \subsection{Comparison between statistical and compartmental models} \label{sec:stat-mech} @@ -1055,7 +951,7 @@ Therefore, the only compartmental model making 1 week-ahead forecasts was the {\ %We identify and quantify several of these challenges, including revisions to initially reported data, .... \subsection{Delayed case reporting impacts forecast score}\label{sec:delays} -<>= +<>= fm4_coefs_fltr <- readRDS(file="./data/delay-model-coefs.rds") lagged_truth <- readRDS(file = "./data/lagged_truth.rds") @@ -1154,6 +1050,156 @@ Public health officials are still learning how to best integrate forecasts into Close collaboration between public health policy-makers and quantitative modelers is necessary to ensure the forecasts have maximum impact and are appropriately communicated to the public and the broader public health community. Real-time implementation and testing of forecasting methods is helpful for planning and assessing what targets should be forecasted for maximum public health impact. +\section{Methods} + +\subsection{FluSight Challenge Overview} + +Detailed methodology and results from previous FluSight challenges have been published\cite{Biggerstaff2016,Biggerstaff2018}, and we summarize the key features of the challenge here. + +The FluSight challenge focuses on forecasts of the weighted percentage of doctor's office visits where the patient showed symptoms of an influenza-like illness in a particular region. Weighting is done by state population as the data are aggregated to the regional level. +This is a standard measure of seasonal flu activity, for which public data is available for the US back to the 1997/1998 influenza season (Figure \ref{fig:intro-schematic}A). +During each influenza season, these data are updated each week by the CDC. When the most recent data are released, the prior weeks' reported wILI data may also be revised. +The unrevised data, available at a particular moment in time, is available via the DELPHI real-time epidemiological data API beginning in the 2014/2015 season.\cite{DELPHI} +This API enables researchers to ``turn back the clock'' to a particular moment in time and use the data available at that time. This tool facilitates more accurate assessment of how models would have performed in real-time. + + +The FluSight challenges have defined seven forecasting targets of particular public health relevance. Three of these targets are fixed scalar values for a particular season: onset week, peak week, and peak intensity (i.e., the maximum observed wILI percentage). The remaining four targets are the observed wILI percentages in each of the subsequent four weeks (Figure \ref{fig:intro-schematic}B). + +The FluSight challenges have also required that all forecast submissions follow a particular format. A single submission file (a comma-separated text file) contains the forecast made for a particular epidemic week (EW) of a season. Standard CDC definitions of epidemic week are used.\cite{NewMexicoDepartmentofHealth,Niemi2015,Tushar2018} Each file contains binned predictive distributions for seven specific targets across the 10 HHS regions of the US plus the national level. Each file contains over 8000 rows and typically is about 400KB in size. 
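+
+A submission file of this type can be inspected with a few lines of R. The sketch below assumes the standard template columns {\tt Location}, {\tt Target}, {\tt Type}, and {\tt Value}, and a hypothetical file name ({\tt EW01-2017-ReichLab\_kde.csv}); it checks that the binned probabilities in each predictive distribution sum to one.
+<<submission-check-sketch, eval=FALSE, echo=TRUE>>=
+library(dplyr)
+## read one hypothetical weekly submission file
+sub <- read.csv("EW01-2017-ReichLab_kde.csv", stringsAsFactors = FALSE)
+## each location-target pair should have a binned distribution summing to ~1
+sub %>%
+  filter(Type == "Bin") %>%
+  group_by(Location, Target) %>%
+  summarise(total_prob = sum(Value)) %>%
+  filter(abs(total_prob - 1) > 1e-6)   # any rows returned here indicate problems
+@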
+
+To be included in the model comparison presented here, previous participants in the CDC FluSight challenge were invited to provide out-of-sample forecasts for the 2010/2011 through 2016/2017 seasons.
+For each season, files were submitted for epidemic week 40 (EW40) of the first calendar year of the season through EW20 of the following calendar year.
+(For seasons that contained an EW53, an additional file labeled EW53 was included.)
+For each model, this involved creating 233 separate forecast submission files, one for each of the weeks in the seven training seasons, each formatted as it would be for submission to the CDC challenge.
+Each team created its submitted forecasts in a prospective, out-of-sample fashion, i.e., fitting or training the model only on data available before the time of the forecast (see Figure \ref{fig:intro-schematic}).
+Some data sources (e.g., wILI data prior to the 2014/2015 season) were not archived in a way that made them reliably retrievable in this ``real-time'' manner. In these situations, teams were still allowed to use these data sources, making best efforts to ensure that each forecast used only data that would have been available at the time the forecast was made.
+
+\subsection{Summary of Models}
+
+Five teams each submitted between 1 and 9 separate models for evaluation (Table \ref{tab:model-list}).
+A wide range of methodological approaches and modeling paradigms is represented in the set of forecast models.
+For example, seven of the models utilize a compartmental structure (e.g., Susceptible-Infectious-Recovered), a model framework that directly encodes both the transmission and the susceptible-limiting dynamics of infectious disease outbreaks.
+Other, less mechanistic models use statistical approaches to model the outbreak phenomenon itself, incorporating recent incidence and seasonal trends.
+One model, {\tt Delphi-Stat}, is an ensemble model, a combination of the other models submitted by the Delphi team.
+Six models directly incorporate external data (i.e., not just the wILI measurements from the CDC ILINet dataset), including historical humidity data and Google search data.
+Two models stand out as reference models: neither changes based on recently observed data.
+The {\tt Delphi-Uniform} model always provides a forecast that assigns equal probability to all possible outcomes.
+The {\tt ReichLab-KDE} model yields predictive distributions based entirely on data from other seasons, using kernel density estimation (KDE) for seasonal targets and a generalized additive model with cyclic penalized splines for weekly incidence.
+Because the {\tt ReichLab-KDE} model essentially summarizes historical data, we consider it an appropriate baseline reflecting historical trends and refer to it as the `historical baseline' model throughout the remainder of the manuscript.
+Once submitted to the central repository, the models were not updated or modified except in four cases to fix explicit bugs in the code that yielded numerical problems with the forecasts.
+(In all cases, the updates did not substantially change the performance of the updated models.)
+Re-fitting of models or tuning of model parameters was explicitly discouraged to avoid unintentional overfitting of models.
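+
+To give a sense of how a historical baseline of this kind can be constructed, the sketch below builds a binned predictive distribution for a seasonal target (e.g., peak intensity) from prior seasons' peak wILI values using kernel density estimation. It is a minimal illustration with hypothetical inputs and illustrative 0.5-percentage-point bins, not the {\tt ReichLab-KDE} implementation.
+<<kde-baseline-sketch, eval=FALSE, echo=TRUE>>=
+## hypothetical peak wILI values observed in prior seasons for one region
+peak_wili <- c(2.3, 3.1, 4.4, 2.8, 6.0, 3.5)
+kde_fit <- density(peak_wili, from = 0, to = 13)
+## approximate the KDE's cumulative distribution on its evaluation grid
+grid_step <- diff(kde_fit$x)[1]
+kde_cdf <- cumsum(kde_fit$y) * grid_step
+## convert to probabilities for 0.5-wide wILI bins and renormalize
+bin_edges <- seq(0, 13, by = 0.5)
+bin_probs <- diff(approx(kde_fit$x, kde_cdf, xout = bin_edges)$y)
+bin_probs <- bin_probs / sum(bin_probs)
+@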
+ +\input{./static-figures/model-table.tex} + +\subsection{Metric Used for Evaluation and Comparison} + +Influenza forecasts have been evaluated by the CDC primarily using a variation of the log-score, a measure that enables evaluation of both the precision and accuracy of a forecast.\cite{Gneiting2007} +The log-score for a model $m$ is defined as $\log f_m(z^*|\bf{x})$ where $f_m(z|\bf{x})$ is the predicted density function from model $m$ for target $Z$, conditional on some data $\bf{x}$, $z^*$ is the observed value of the target $Z$, and $\log$ is the natural logarithm. +The log-score is a ``proper`` scoring rule, which has the practical implication that linear combinations (i.e., arithmetic means) of log scores will also be proper.\cite{Gneiting2007} + +Consistent with the primary evaluation performed by the CDC, we used a modified form of the log-score to evaluate forecasts. +The modified log-scores are computed for the targets on the wILI percentage scale such that predictions within $\pm$ 0.5 percentage points are considered accurate, i.e., modified log score = $\log \int_{z^* -.5}^{z^* + .5} f_m(z|{\bf{x}})dz$. +For the targets on the scale of epidemic weeks, predictions within $\pm$ 1 week are considered accurate, i.e., modified log score = $\log \int_{z^* -1}^{z^* + 1} f_m(z|{\bf{x}})dz$. +While this modification means that the resulting score is not formally a proper scoring rule, some have suggested that improper scores derived from proper scoring rules may, with large enough sample size, have negligible differences in practice.\cite{Gneiting2007} % see especially last paragraph of section 2.3 +Additionally, this modified log score has the advantage of having a clear interpretation and was motivated and designed by public health officials to reflect an accuracy of practical significance. +Hereafter, we will refer to these modified log scores as simply log scores. + +Average log scores can be used to compare models' performance in forecasting for different locations, seasons, targets, or times of season. +In practice, each model $m$ has a set of log scores associated with it that are region-, target-, season-, and week-specific. +We represent one specific scalar log score value as $\log f_{m,r,t,s,w}(z^*|\bf{x})$. +These values can be averaged across any of the indices to create a summary measure of performance. +For example, +\begin{eqnarray} +LS_{m,\cdot,t,\cdot,\cdot} & = & \frac{1}{N} \sum_{r,s,w} \log f_{m,r,t,s,w}(z^*|{\bf x}) +\end{eqnarray} +represents a log score for model $m$ and target $t$ averaged across all regions, seasons and weeks. + +While log scores are not on a particularly interpretable scale, a simple transformation enhances interpretability substantially. +Exponentiating an average log score yields a forecast score equivalent to the geometric mean of the probabilities assigned to the eventually observed outcome (that is, to bins eventually considered accurate). +The geometric mean is an alternative measure of central tendency to an arithmetic mean, representing the $N^{th}$ root of a product of $N$ numbers. 
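+
+As a concrete illustration of these two computations (a sketch only, using a hypothetical binned wILI forecast and toy values rather than an actual submission), the modified log score sums the probability mass within $\pm$0.5 percentage points of the observed value before taking the log, and an average score is the exponentiated arithmetic mean of such log scores:
+<<score-computation-sketch, eval=FALSE, echo=TRUE>>=
+## hypothetical binned forecast for a wILI target: 0.1-wide bins
+forecast_bins <- data.frame(bin_start = seq(0, 12.9, by = 0.1))
+forecast_bins$prob <- dnorm(forecast_bins$bin_start, mean = 2.5, sd = 0.6)
+forecast_bins$prob <- forecast_bins$prob / sum(forecast_bins$prob)
+observed <- 2.2   # eventually observed wILI value
+## modified log score: log of the probability within +/- 0.5 of the observation
+in_window <- with(forecast_bins,
+                  bin_start >= observed - 0.5 & bin_start < observed + 0.5)
+modified_log_score <- max(log(sum(forecast_bins$prob[in_window])), -10)  # -10 floor
+## an average forecast score: exponentiate the mean of several such log scores
+avg_score <- exp(mean(c(modified_log_score, -1.2, -0.8)))  # toy companion values
+@
+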
+Using the notation above, we then have that
+\begin{eqnarray}
+S_{m,\cdot,t,\cdot,\cdot} = \exp \left ( LS_{m,\cdot,t,\cdot,\cdot} \right ) & = & \exp \left ( \frac{1}{N} \sum_{r,s,w} \log f_{m,r,t,s,w}(z^*|{\bf x}) \right ) \\
+ & = & \left ( \prod_{r,s,w} f_{m,r,t,s,w}(z^*|{\bf x}) \right ) ^{1/N}
+\end{eqnarray}
+In this setting, the score has the intuitive interpretation of an average probability assigned to the true outcome, where the average is a geometric average.
+Hereafter, when we refer to an average score, we mean an exponentiated average log score.
+In all cases, we compute the averages arithmetically on the log scale and exponentiate only before reporting and interpreting a final number.
+Therefore, all reported average scores can be interpreted as the corresponding geometric means, or as the corresponding average probabilities assigned to the true outcome.
+
+Following the convention of the CDC challenges, we only included certain weeks in the calculation of the average log scores for each target.
+This focuses model evaluation on the periods of time that are most relevant for public health decision making.
+Forecasts of season onset are evaluated using forecasts received up to six weeks after the observed onset week within a given region.
+Peak week and peak intensity forecasts are scored for all weeks in a specific region-season up until the wILI measure drops below the regional baseline level for the final time.
+(All weeks are scored if wILI never goes above the baseline.)
+%Forecasts of season peak and intensity are evaluated through the first forecast received after the weighted ILI goes below the regional baseline for the final time during a given region-season.
+Week-ahead forecasts are evaluated using forecasts received from four weeks prior to the onset week through three weeks after the weighted ILI goes below the regional baseline for the final time.
+In a region-season without an onset, all weeks are scored.
+To ensure that all calculated summary measures are finite, log scores of less than -10 were assigned the value -10, following CDC scoring conventions.
+All scores were based on ``ground truth'' values of wILI data obtained as of September 27, 2017.
+
+% \subsection{Formal comparisons of model performance}
+%
+% Model-based comparisons of forecast accuracy are hindered by the high correlation of sequential forecasts and by outlying observations.
+% When observations assign no probability to the eventually observed outcome they have a log-score of $-\infty$.
+
+%Things to confirm: removed weeks that CDC does not score, no onset seasons and multi peak years are handled appropriately
+
+\subsection{Specific model comparisons}\label{sec:delay-model}
+
+\subsubsection*{Analysis of data revisions}
+% from fig-delay-model-coefs.R
+% fm4 <- glm(exp(multi_bin_score) ~ Model + Target + factor(forecasted_epiweek) + bias_first_report_factor, data=scores_for_analysis)
+
+The CDC publicly releases data on doctor's office visits due to ILI each week.
+The data for previous weeks (especially the most recent ones) are occasionally revised as new or updated data are reported to the CDC after their initial publication.
+Often these revisions are minor or non-existent, but at other times they can be substantial, changing the reported wILI value by over 50\% of the originally reported value.
+Since these data are used by forecasters to generate current forecasts, these forecasts can be contaminated by the initially reported, biased data. + +We used a regression model to analyze the impact of these unrevised reports on forecasting. +Specifically, for each region and epidemic week we calculated the difference between the first and the last reported ILI values for each epidemic week for which forecasts were generated in the seven seasons under consideration. +We then created a categorical variable ($X$) with a binned representation of these differences using the following six categories covering the entire range of observed values: (-3.5,-2.5], (-2.5,-1.5], ..., (1.5,2.5]. +Using the forecasting results from the four most accurate individual non-ensemble models, ({\tt ReichLab-KCDE}, {\tt LANL-DBM}, {\tt Delphi-DeltaDensity1}, {\tt CU-EKF\_SIRS}), we then fit the following linear regression model +\begin{equation} +S_i = \beta + \alpha_{m(i)} + \gamma_{t(i)} + \lambda_{w(i)} + {\mathbf \theta}\cdot X_i + \epsilon_i +\end{equation} +where $S_i$ is the score, the index $i$ indexes a specific set of subscripts $\{m,r,t,s,w\}$, and the $\alpha_{m(i)}$, $\gamma_{t(i)}$, and $\lambda_{w(i)}$ are model-, target-, and week-specific fixed effects, respectively. (The notation $m(i)$ refers to the model contained in the $i$th observation of the dataset.) The error term is assumed to follow a Gaussian distribution with mean zero and an estimated variance parameter. +The parameter of interest in the model is the vector ${\mathbf \theta}$, which represents the average change in score based on the magnitude of the bias in the latest available wILI value, adjusted for differences based on model, target, and week-of-season. The $[-0.5,+0.5]$ change category was taken as a baseline category and the corresponding $\theta$ entry constrained to be 0, so that other $\theta$ entries represent deviations from this baseline. +%the expected changes in average score from the reference category (defined as the bin representing changes of between $\pm$ 0.5 from the original reported value) adjusting for model, target and week-of-season. + +\subsubsection*{Mechanistic vs. statistical models} + +% Infectious disease modeling has proven to be fertile ground for statisticians, mathematicians, and quantitative modelers for over a century. +There is not a consensus on a single best modeling approach or method for forecasting the dynamic patterns of infectious disease outbreaks in both endemic and emergent settings. +Semantically, modelers and forecasters often use a dichotomy of mechanistic vs. statistical (or `phenomenological') models to represent two different philosophical approaches to modeling. +Mechanistic models for infectious disease consider the biological underpinnings of disease transmission, and in practice are implemented as variations on the Susceptible-Infectious-Recovered (SIR) model. +Statistical models largely ignore the biological underpinnings and theory of disease transmission and focus instead on using data-driven, empirical and statistical approaches to make the best forecasts possible of a given dataset, or phenomenon. + +However, in practice, this dichotomized distinction is less clear than it is in theory. +For example, statistical models for infectious disease counts may encode an autoregressive term for incidence (i.e., as done by the {\tt ReichLab-KCDE} model). +This could be interpreted as representing a transmission process from one time period to another. 
+In another example, the {\tt LANL-DBM} model has an explicit SIR compartmental model component but also leverages a hierarchical discrepancy which is purely statistical. +The models from Columbia University used a statistical `now-casting' approach for their 1-week ahead forecasts, but after that relied on different variations of an SIR model. +% Both approaches are commonly used and both have advantages and disadvantages in different settings. + +We categorized models according to whether or not they had any explicit compartmental framework (Table \ref{tab:model-list}). +We then took the top three performing compartmental models (i.e., models with some kind of an underlying compartmental structure) and compared their performance with the top three individual component models without compartmental structure. +We excluded multi-model ensemble models (i.e., {\tt Delphi-Stat}) from this comparison. +Separately for each target, we compared the average score of the top three compartmental models to the average score of the top three non-compartmental models. + +\subsection{Reproducibility and data availability} + +To maximize the reproducibility and data availability for this project, the data and code for the entire project (excluding specific model code) are publicly available. +The project is available on GitHub\cite{fsngithub2018}, with a permanent repository [[stored on Zotero]]. +All of the forecasts may be interactively browsed on the website http://flusightnetwork.io. +A web applet with interactive visualizations of the model evaluations is available at https://ermoore.shinyapps.io/FSN\_Model\_Comparison/. [[link to change]] +Additionally, this manuscript was dynamically generated using \Sexpr{R.version.string}, Sweave, and knitr, tools for intermingling manuscript text with R code that run the central analyses, minimizing the chances for errors transcribing or translating results.\cite{Xie2015,RCore2017}. + + \bibliographystyle{unsrt} \bibliography{../flusightnetwork} diff --git a/writing/comparison/static-figures/model-table.tex b/writing/comparison/static-figures/model-table.tex index cfbe63aa..9402ede8 100644 --- a/writing/comparison/static-figures/model-table.tex +++ b/writing/comparison/static-figures/model-table.tex @@ -1,37 +1,37 @@ \begin{table} \setlength{\tabcolsep}{4pt} -\begin{tabular}{p{1.69cm} l p{7.5cm} p{1.1cm} p{1.1cm} p{1.1cm}} +\begin{tabular}{p{1.6cm} l p{7.5cm} l p{1cm} p{1cm} p{1cm}} \hline -Team & Model Abbr& Model Description & Ext. Data & Comp. Model* & Ens. Model \\ +Team & Model Abbr& Model Description & & Ext. Data & Mech. Model & Ens. 
+
+\subsection{Reproducibility and data availability}
+
+To maximize reproducibility and data availability, the data and code for the entire project (excluding code specific to the individual models) are publicly available.
+The project is available on GitHub\cite{fsngithub2018}, with a permanent repository [[stored on Zotero]].
+All of the forecasts may be interactively browsed on the website http://flusightnetwork.io.
+A web applet with interactive visualizations of the model evaluations is available at https://ermoore.shinyapps.io/FSN\_Model\_Comparison/. [[link to change]]
+Additionally, this manuscript was dynamically generated using \Sexpr{R.version.string}, Sweave, and knitr, tools for intermingling manuscript text with the R code that runs the central analyses, thereby minimizing the chance of errors in transcribing or translating results.\cite{Xie2015,RCore2017}
+
+
 \bibliographystyle{unsrt}
 \bibliography{../flusightnetwork}
diff --git a/writing/comparison/static-figures/model-table.tex b/writing/comparison/static-figures/model-table.tex
index cfbe63aa..9402ede8 100644
--- a/writing/comparison/static-figures/model-table.tex
+++ b/writing/comparison/static-figures/model-table.tex
@@ -1,37 +1,37 @@
 \begin{table}
 \setlength{\tabcolsep}{4pt}
-\begin{tabular}{p{1.69cm} l p{7.5cm} p{1.1cm} p{1.1cm} p{1.1cm}}
+\begin{tabular}{p{1.6cm} l p{7.5cm} l p{1cm} p{1cm} p{1cm}}
 \hline
-Team & Model Abbr& Model Description & Ext. Data & Comp. Model* & Ens. Model \\
+Team & Model Abbr & Model Description & Refs & Ext. Data & Mech. Model & Ens. Model \\
 \hline
-CU & EAKFC\_SEIRS & Ensemble Adjustment Kalman Filter SEIRS & x & x & \\
+CU & EAKFC\_SEIRS & Ensemble Adjustment Kalman Filter SEIRS & \cite{Pei2018} & x & x & \\
-~ & EAKFC\_SIRS & Ensemble Adjustment Kalman Filter SIRS & x & x & \\
-~ & EKF\_SEIRS & Ensemble Kalman Filter SEIRS & x & x & \\
-~ & EKF\_SIRS & Ensemble Kalman Filter SIRS & x & x & \\
-~ & RHF\_SEIRS & Rank Histogram Filter SEIRS & x & x & \\
-~ & RHF\_SIRS & Rank Histogram Filter SIRS & x & x & \\
-~ & BMA & Bayesian Model Averaging & ~ & ~ & x \\
+~ & EAKFC\_SIRS & Ensemble Adjustment Kalman Filter SIRS & \cite{Pei2018} & x & x & \\
+~ & EKF\_SEIRS & Ensemble Kalman Filter SEIRS & \cite{Yang2017} & x & x & \\
+~ & EKF\_SIRS & Ensemble Kalman Filter SIRS & \cite{Yang2017} & x & x & \\
+~ & RHF\_SEIRS & Rank Histogram Filter SEIRS & \cite{Yang2017} & x & x & \\
+~ & RHF\_SIRS & Rank Histogram Filter SIRS & \cite{Yang2017} & x & x & \\
+~ & BMA & Bayesian Model Averaging & \cite{Yamana2017} & ~ & ~ & x \\
 \hline
-Delphi & BasisRegression & Basis Regression (epiforecast package defaults) & ~ & ~ & \\
-~ & DeltaDensity1 & Delta Density (epiforecast package defaults) & ~ & ~ & \\
-~ & EmpiricalBayes1 & Empirical Bayes (conditioning on past four weeks only) & ~ & ~ & \\
-~ & EmpiricalBayes2 & Empirical Bayes (epiforecast package defaults) & ~ & ~ & \\
-~ & EmpiricalFuture & Empirical Futures (epiforecast package defaults) & ~ & ~ & \\
-~ & EmpiricalTraj & Empirical Trajectories (epiforecast package defaults) & ~ & ~ & \\
-~ & DeltaDensity2 & Markovian Delta Density (epiforecast package defaults) & ~ & ~ & \\
-~ & Uniform & Uniform Distribution & ~ & ~ & \\
-~ & Stat & Statistical Ensemble (combining predictions from the other 8 models submitted by Delphi) & ~ & ~ & x \\
+Delphi & BasisRegression & Basis Regression ({\tt epiforecast} defaults) & \cite{Brooks2015a} & ~ & ~ & \\
+~ & DeltaDensity1 & Delta Density ({\tt epiforecast} defaults) & \cite{Brooks2015a} & ~ & ~ & \\
+~ & EmpiricalBayes1 & Empirical Bayes (conditioning on past four weeks) & \cite{Brooks2015,Brooks2015a} & ~ & ~ & \\
+~ & EmpiricalBayes2 & Empirical Bayes ({\tt epiforecast} defaults) & \cite{Brooks2015,Brooks2015a} & ~ & ~ & \\
+~ & EmpiricalFuture & Empirical Futures ({\tt epiforecast} defaults) & \cite{Brooks2015a} & ~ & ~ & \\
+~ & EmpiricalTraj & Empirical Trajectories ({\tt epiforecast} defaults) & \cite{Brooks2015a} & ~ & ~ & \\
+~ & DeltaDensity2 & Markovian Delta Density ({\tt epiforecast} defaults) & \cite{Brooks2015a} & ~ & ~ & \\
+~ & Uniform & Uniform Distribution & & ~ & ~ & \\
+~ & Stat & Statistical Ensemble (combining predictions from the other 8 models submitted by Delphi) & & ~ & ~ & x \\
 \hline
-LANL & DBM & Dynamic Bayesian SIR Model with a hierarchical discrepancy & ~ & x & \\
+LANL & DBM & Dynamic Bayesian SIR Model with a hierarchical discrepancy & \cite{Osthus2017} & ~ & x & \\
 \hline
-ReichLab & KCDE & Kernel Conditional Density Estimation using recent observations and seasonality & ~ & ~ & \\
-~ & KDE & Kernel Density Estimation (seasonal targets) and cyclic penalized splines (week-ahead targets) & ~ & ~ & \\
-~ & SARIMA1 & SARIMA model without seasonal differencing & ~ & ~ & \\
-~ & SARIMA2 & SARIMA model with seasonal differencing & ~ & ~ & \\
+ReichLab & KCDE & Kernel Conditional Density Estimation using recent observations and seasonality & \cite{Ray2017} & ~ & ~ & \\
+~ & KDE & Kernel Density Estimation (seasonal targets) and cyclic penalized splines (week-ahead targets) & \cite{Ray2018} & ~ & ~ & \\
+~ & SARIMA1 & SARIMA model without seasonal differencing & \cite{Ray2018} & ~ & ~ & \\
+~ & SARIMA2 & SARIMA model with seasonal differencing & \cite{Ray2018} & ~ & ~ & \\
 \hline
-UTAustin & EDM & Empirical Dynamic Model, or topological method of analogues & ~ & ~ & \\
+UTAustin & EDM & Empirical Dynamic Model, or topological method of analogues & & ~ & ~ & \\
 \end{tabular}
-\caption{List of models, with key characteristics. Team abbreviations are translated as: CU = Columbia University (PI: Shaman), Delphi = Carnegie Mellon (PI: Rosenfeld), LANL = Los Alamos National Laboratories (PI: Osthus), ReichLab = University of Massachusetts Amherst (PI: Reich), UTAustin = University of Texas Austin (PI: Lauren Meyers). The `Ext data' column notes models that use data external to the ILI Net data from CDC. The `Comp. model' column notes models that rely to some extent on an compartmental model formulation. The `Ens. model' column notes models that are ensemble models.}
+\caption{List of models, with key characteristics. Team abbreviations are translated as: CU = Columbia University (PI: Shaman), Delphi = Carnegie Mellon (PI: Rosenfeld), LANL = Los Alamos National Laboratory (PI: Osthus), ReichLab = University of Massachusetts Amherst (PI: Reich), UTAustin = University of Texas Austin (PI: Lauren Meyers). The `Refs' column lists references describing each model, where available. The `Ext. Data' column notes models that use data external to the ILINet data from CDC. The `Mech. Model' column notes models that rely to some extent on a mechanistic or compartmental model formulation. The `Ens. Model' column notes models that are ensemble models.}
 \label{tab:model-list}
 \end{table}
\ No newline at end of file
diff --git a/writing/flusightnetwork.bib b/writing/flusightnetwork.bib
index 5b68f892..ae224662 100644
--- a/writing/flusightnetwork.bib
+++ b/writing/flusightnetwork.bib
@@ -613,3 +613,51 @@ @Book{Xie2015
  note = {ISBN 978-1498716963},
  url = {https://yihui.name/knitr/},
 }
+@manual{Brooks2015a,
+annote = {R package version 0.0.1},
+author = {Brooks, Logan C and Farrow, David C and Hyun, Sangwon and Tibshirani, Ryan J and Rosenfeld, Roni},
+title = {{epiforecast: Tools for forecasting semi-regular seasonal epidemic curves and similar time series}},
+year = {2015}
+}
+@article{Ray2018,
+author = {Ray, Evan L. and Reich, Nicholas G.},
+doi = {10.1371/journal.pcbi.1005910},
+editor = {Viboud, Cecile},
+issn = {1553-7358},
+journal = {PLOS Computational Biology},
+month = {feb},
+number = {2},
+pages = {e1005910},
+publisher = {Public Library of Science},
+title = {{Prediction of infectious disease epidemics via weighted density ensembles}},
+url = {http://dx.plos.org/10.1371/journal.pcbi.1005910},
+volume = {14},
+year = {2018}
+}
+@article{Pei2018,
+author = {Pei, Sen and Kandula, Sasikiran and Yang, Wan and Shaman, Jeffrey},
+doi = {10.1073/pnas.1708856115},
+issn = {1091-6490},
+journal = {Proceedings of the National Academy of Sciences of the United States of America},
+keywords = {data assimilation,human mobility,influenza forecast,metapopulation model,spatial transmission},
+month = {mar},
+number = {11},
+pages = {2752--2757},
+pmid = {29483256},
+publisher = {National Academy of Sciences},
+title = {{Forecasting the spatial transmission of influenza in the United States}},
+url = {http://www.ncbi.nlm.nih.gov/pubmed/29483256},
+volume = {115},
+year = {2018}
+}
+@article{Osthus2017,
+archivePrefix = {arXiv},
+journal = {arXiv},
+arxivId = {1708.09481},
+author = {Osthus, Dave and Gattiker, James and Priedhorsky, Reid and {Del Valle}, Sara Y.},
+eprint = {1708.09481},
+month = {aug},
+title = {{Dynamic Bayesian Influenza Forecasting in the United States with Hierarchical Discrepancy}},
+url = {http://arxiv.org/abs/1708.09481},
+year = {2017}
+}