forked from melsod/OCSWinter2020
-
Notifications
You must be signed in to change notification settings - Fork 0
/
followup_analysis.R
332 lines (253 loc) · 15.3 KB
/
followup_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#······································································································
#································Get Trial-by-Trial Data ··············································
#······································································································
###### Unless you want to recompute the dataframe skip to Eliminate excluded participants section
#install.packages("jsonlite")
library("jsonlite")
# Recompute switch for the raw-JSON section guarded by `if(flag==TRUE)` below.
# Keep FALSE (the documented default) unless ./data/final_data.json is available
# locally: the guarded code reads that Git LFS file, and the tidy data frame is
# re-read from data/trial_data.csv further down regardless of this setting.
flag <- FALSE
# *************************************************************************************************************
# ********** Users unfamiliar with Git LFS may have difficulty getting/using the final_data.json file *********
# ********** so the code that relies on this file is meant to be skipped by default. Set the flag     *********
# ********** parameter to TRUE only if you have the final_data.json file and want to recompute the    *********
# ********** dataframe. The tidied dataframe should already be saved as trial_data.csv in the data    *********
# ********** folder, so recomputing it should be unnecessary (leave flag as FALSE)                    *********
# *************************************************************************************************************
# SKIP THE FOLLOWING CODE BY DEFAULT
if(flag==TRUE){
# read the raw JSON export; the path is relative, so this assumes the working
# directory is the root of the cloned GitHub repo
data <- jsonlite::fromJSON("./data/final_data.json")
# collapse the outermost JSON level, keeping the element names
data <- unlist(data, recursive = FALSE, use.names = TRUE)
# install.packages("plyr")
library("plyr")
# stack the remaining pieces into one data frame (rbind.fill is provided by plyr)
data <- rbind.fill(data)
# function to pull one answer out of the parsed survey responses
#
# key:         pattern identifying the survey question (e.g. "Childcare"); it is
#              passed to grep(), so it is interpreted as a regular expression
# survey_data: list of parsed survey responses; grep() coerces each element to
#              character before matching
#
# Returns the LAST element of survey_data that matches key.
# Stops with an explicit error when nothing matches (without the check,
# max() of an empty match set is -Inf and the lookup fails with an unrelated
# subscript error).
extract_json <- function(key, survey_data) {
  hits <- grep(key, survey_data)
  if (length(hits) == 0) {
    stop("extract_json: no survey response matches key '", key, "'",
         call. = FALSE)
  }
  survey_data[[max(hits)]]
}
# preallocate the output: 34 zero-filled rows per unique subject, 21 columns
tidy_data <- data.frame(
  matrix(0, nrow = length(unique(data$subject)) * 34, ncol = 21)
)
# row pointer: index of the next tidy_data row to be filled
count <- 1
# for each subject in the data
for(i in 1:length(unique(data$subject))){
# rows belonging to the i-th unique participant
sub_data <- data[data$subject == unique(data$subject)[i], ]
# that participant's survey responses (JSON strings), without missing entries
survey_data <- sub_data$responses[sub_data$phase == "Survey"]
survey_data <- survey_data[!is.na(survey_data)]
# parse each JSON string. lapply() always returns one parsed element per
# response; sapply() would flatten the result whenever every response happened
# to hold exactly one field, which would break the key lookups done by
# extract_json() (they match against the parsed element, field name included)
survey_data <- lapply(survey_data, fromJSON)
# get subject ID code
subject <- sub_data$subject[1]
# childcare experience in months
childcare_answer <- extract_json("Childcare", survey_data)
childcare <- as.numeric(childcare_answer)
# The survey required a numerical response but accepted anything that started
# with a number (e.g. "12 months"). When the direct conversion gives NA, try to
# recover a value from the text instead.
if (is.na(childcare)) {
  # split the raw answer into space-separated words
  childcare_txt <- strsplit(childcare_answer[[1]], split = " ")
  if (length(childcare_txt[[1]]) != 1) {
    # several words: first word is the number, second word is taken as the unit
    unit_initial <- tolower(
      substring(text = childcare_txt[[1]][2], first = 1, last = 1)
    )
    if (unit_initial == "y") {
      # unit starts with "y" (e.g. "2 years"): convert years to months
      childcare <- 12 * as.numeric(childcare_txt[[1]][1])
    } else {
      # otherwise assume something like "12 months" and keep the number as is
      childcare <- as.numeric(childcare_txt[[1]][1])
    }
  } else {
    # one word only (no spaces): assume a range such as "4-6", split on "-"
    childcare_txt <- strsplit(childcare_answer[[1]], split = "-")
    # average the first piece with all remaining pieces ("4-6" gives 5)
    range_first <- as.numeric(childcare_txt[[1]][1])
    range_rest <- as.numeric(childcare_txt[[1]][-1])
    childcare <- mean(c(range_first, range_rest))
  }
}
# end of the section handling free-text childcare answers
# Caregiving experience in months, parsed with the same steps as Childcare:
# try a direct numeric conversion first, then fall back to the free text.
caregiver <- as.numeric(extract_json("Caregiver", survey_data)[[1]])
if (is.na(caregiver)) {
  # split the raw answer into space-separated words
  caregiver_txt <- strsplit(extract_json("Caregiver", survey_data)[[1]], split = " ")
  if (length(caregiver_txt[[1]]) == 1) {
    # one word only: assume a range such as "4-6" and split on "-"
    caregiver_txt <- strsplit(extract_json("Caregiver", survey_data)[[1]], split = "-")
    # the pieces are character strings, so they must be converted before
    # averaging: mean() of a character vector returns NA with a warning
    caregiver <- mean(c(as.numeric(caregiver_txt[[1]][1]),
                        as.numeric(caregiver_txt[[1]][-1])))
  } else {
    if (tolower(substring(text = caregiver_txt[[1]][2], first = 1, last = 1)) == "y") {
      # second word starts with "y" (e.g. "2 years"): convert years to months
      caregiver <- (12) * as.numeric(caregiver_txt[[1]][1])
    } else {
      # otherwise assume something like "12 months" and keep the number as is
      caregiver <- as.numeric(caregiver_txt[[1]][1])
    }
  }
}
# end of the section accounting for bad data entry for caregiving
# participant's age
par_age <- as.numeric(extract_json("Par_Age", survey_data))
# gender question: field 1 is the participant's gender, field 2 is the text
# written in the "Other gender" box
gender_answer <- extract_json("Gender_Q", survey_data)
par_gen <- as.character(gender_answer[[1]])
par_gen_other <- as.character(gender_answer[[2]])
# country question: field 1 is the country of residence, field 2 is the text
# written in the "Other Country" box
country_answer <- extract_json("Country_Q", survey_data)
par_country <- as.character(country_answer[[1]])
par_country_other <- as.character(country_answer[[2]])
# whether the participant has normal hearing
hearing <- as.character(extract_json("Normal_hearing", survey_data))
# answer stored under "Engl_first_lang" (presumably whether English is the
# participant's first language - confirm against the survey)
engl_first <- as.character(extract_json("Engl_first_lang", survey_data))
# whether the participant knows any of the corpus languages
know_corp <- as.character(paste0(extract_json("Know_corp_lang", survey_data)))
# whether the participant is monolingual
monolingual <- as.character(extract_json("monolingual", survey_data))
# for each data point of the participant singled out above
# (seq_len() yields an empty sequence for zero rows, whereas 1:nrow() would
# iterate over c(1, 0))
for (j in seq_len(nrow(sub_data))) {
  # rows scored 0 or 1 are experimental trials; every other row is skipped
  if (sub_data$correct[j] %in% c(0, 1)) {
    # write one output row: participant-level survey values are repeated on
    # every trial, trial-level values come from row j of sub_data
    tidy_data[count, ] <- data.frame(
      subject,
      sub_data$correct[j],
      sub_data$phase[j],
      sub_data$time_elapsed[j] / 60000, # presumably ms -> minutes; confirm units
      childcare,
      caregiver,
      par_age,
      par_gen,
      par_gen_other,
      par_country,
      par_country_other,
      hearing,
      engl_first,
      know_corp,
      monolingual,
      sub_data$age_group[j],
      sub_data$button_pressed[j],
      sub_data$gender[j],
      sub_data$language[j],
      sub_data$unique_id[j],
      substr(sub_data$stimulus[j], 53, 66), # characters 53-66 of the stimulus string
      stringsAsFactors = FALSE
    )
    # update the count so the next trial goes into the next row
    count <- count + 1
  }
}
}
# tidy_data was preallocated with 34 zero-filled rows per subject, and count
# points at the next unwritten row. Keep only the rows that were actually
# filled, so a participant with fewer than 34 scored rows does not leave
# all-zero placeholder rows in the output.
tidy_data <- tidy_data[seq_len(count - 1), , drop = FALSE]
# set the variable names (one per column, in the order the columns were filled)
colnames(tidy_data) <- c(
  "subject_ID", "correct", "phase", "time_ellapsed_mins", "childcare", "caregiver",
  "age", "gender", "gender_text", "country", "country_text", "hearing",
  "eng_first", "know_corp_lang", "monolingual", "stim_ageGroup", "button_pressed",
  "stim_gender", "stim_language", "stim_ID", "clipID"
)
}
# END DEFAULT SKIPPING CODE
#··············································································································
#································Save trial by trial data as csv ··············································
#··············································································································
# *************************************************************************************************************
#************** If you have successfully run the above code then you can save the dataframe for later *********
#write.csv(tidy_data, "./data/trial_data.csv", row.names = FALSE) ####### uncomment this line to run it
# *************************************************************************************************************
#··············································································································
#································Eliminate excluded participants ··············································
#··············································································································
###### REQUIRES w_exclusions_summarized_data.csv file. If not found then run data_cleaning.R to create the file
#install.packages("readr")
library(readr)
# *************************************************************************************************************
#************** If you have a previously saved trial_data.csv then this is quicker**************
tidy_data <- read_csv("data/trial_data.csv")
#**************************************************************************************************************
# load the summarized datafile that includes participant exclusions
# (created by data_cleaning.R)
w_exclusions_data <- read_csv("data/w_exclusions_summarized_data.csv")
# keep only the trials of participants who appear in that file, i.e. who pass
# the exclusion criteria
retained <- tidy_data$subject_ID %in% w_exclusions_data$subject_ID
tidy_data <- tidy_data[retained, , drop = FALSE]
#··············································································································
#································Save trial by trial data as csv w exclusions ·································
#··············································································································
# uncomment if you want to resave the trial-by-trial data with the appropriate participants excluded
# it should already be found in the data folder of the repository
#write.csv(tidy_data, "./data/w_exclusions_trial_data.csv", row.names = FALSE)
#··············································································································
#································Follow Up Analysis ···························································
#··············································································································
# identify infant age
library(lme4)
# Trials from the "Age" phase for participants whose gender is "Male" or
# "Female". Built once so that every model below, and therefore every anova()
# comparison, is fitted to the same data object.
# NOTE(review): glmer drops rows with missing predictor values per model, and
# anova() needs the compared models to use the same observations - confirm that
# childcare/caregiver have no NAs in this subset.
age_trials <- subset(
  tidy_data,
  tidy_data$gender %in% c("Male", "Female") & tidy_data$phase == "Age"
)
# Random intercept Model: Baseline
model1 <- glmer(correct ~ 1 + (1 | subject_ID),
                data = age_trials,
                family = binomial(link = logit))
summary(model1)
# Adding level 1 predictors: childcare & caregiver
model2 <- glmer(correct ~ 1 + childcare + caregiver + (1 | subject_ID),
                data = age_trials,
                family = binomial(link = logit))
summary(model2)
# Compare two models
anova(model1, model2)
# additional model with only childcare
model4 <- glmer(correct ~ 1 + childcare + (1 | subject_ID),
                data = age_trials,
                family = binomial(link = logit))
summary(model4)
# compare to baseline model
anova(model1, model4)
# Adding level 2 predictor: Participant's gender
model3 <- glmer(correct ~ 1 + childcare + caregiver + gender + (1 | subject_ID),
                data = age_trials,
                family = binomial(link = logit))
summary(model3)
# Compare two models
anova(model2, model3)
#`````````````````````````````````````````````````````````````````````````````````````````````````````````````
# identify infant language
library(lme4)
# Trials from the "Language" phase for participants whose gender is "Male" or
# "Female". Built once so that every model below, and therefore every anova()
# comparison, is fitted to the same data object.
language_trials <- subset(
  tidy_data,
  tidy_data$gender %in% c("Male", "Female") & tidy_data$phase == "Language"
)
# Random intercept Model: Baseline
model1 <- glmer(correct ~ 1 + (1 | subject_ID),
                data = language_trials,
                family = binomial(link = logit))
summary(model1)
# Adding level 1 predictors: childcare & caregiver
model2 <- glmer(correct ~ 1 + childcare + caregiver + (1 | subject_ID),
                data = language_trials,
                family = binomial(link = logit))
summary(model2)
# Compare two models
anova(model1, model2)
# Adding level 2 predictor: Participant's gender
model3 <- glmer(correct ~ 1 + childcare + caregiver + gender + (1 | subject_ID),
                data = language_trials,
                family = binomial(link = logit))
summary(model3)
# Compare two models
anova(model2, model3)
#`````````````````````````````````````````````````````````````````````````````````````````````````````````````
# identify infant sex (at request of reviewers)
library(lme4)
# Trials from the "Sex" phase for participants whose gender is "Male" or
# "Female". Built once so that every model below, and therefore every anova()
# comparison, is fitted to the same data object.
sex_trials <- subset(
  tidy_data,
  tidy_data$gender %in% c("Male", "Female") & tidy_data$phase == "Sex"
)
# Random intercept Model: Baseline
model1 <- glmer(correct ~ 1 + (1 | subject_ID),
                data = sex_trials,
                family = binomial(link = logit))
summary(model1)
# Adding level 1 predictors: childcare & caregiver
model2 <- glmer(correct ~ 1 + childcare + caregiver + (1 | subject_ID),
                data = sex_trials,
                family = binomial(link = logit))
summary(model2)
# Compare two models
anova(model1, model2)
# Adding level 2 predictor: Participant's gender
model3 <- glmer(correct ~ 1 + childcare + caregiver + gender + (1 | subject_ID),
                data = sex_trials,
                family = binomial(link = logit))
summary(model3)
# Compare two models
anova(model2, model3)