#data_exploration_r.R
#this function sums the User_count column within each value of col_name,
#so every password is weighted by how many users actually chose it
#instead of assuming every password occurs at the same frequency
#(the value range comes from the full dataset so every country returns bins of the same length)
user_count_summer <- function(df, col_name){
  column_values <- 0:max(top_200_password_2020_by_country_extended[[col_name]])
  sums <- c()
  for(i in column_values){
    df_temp <- df[df[[col_name]] == i, ]
    freq_sum <- sum(df_temp$User_count)
    sums <- c(sums, freq_sum)
  }
  return(sums)
}
#breaking down each country into its own DF
df_china <- top_200_password_2020_by_country_extended[top_200_password_2020_by_country_extended$country == "China", ]
df_us <- top_200_password_2020_by_country_extended[top_200_password_2020_by_country_extended$country == "United States", ]
df_russia <- top_200_password_2020_by_country_extended[top_200_password_2020_by_country_extended$country == "Russia", ]
df_vietnam <- top_200_password_2020_by_country_extended[top_200_password_2020_by_country_extended$country == "Vietnam", ]
df_spain <- top_200_password_2020_by_country_extended[top_200_password_2020_by_country_extended$country == "Spain", ]
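#quick sanity check (my addition, not in the original script): confirm the country labels
#matched exactly and each subset holds roughly 200 passwords
nrow(df_china); nrow(df_us); nrow(df_russia); nrow(df_vietnam); nrow(df_spain)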
#using our function to get User_count-weighted totals of length and unique_chars for each country
lengths_china <- user_count_summer(df_china, "length")
lengths_us <- user_count_summer(df_us, "length")
lengths_russia <- user_count_summer(df_russia, "length")
lengths_vietnam <- user_count_summer(df_vietnam, "length")
lengths_spain <- user_count_summer(df_spain, "length")
nums_china <- user_count_summer(df_china, "unique_chars")
nums_us <- user_count_summer(df_us, "unique_chars")
nums_russia <- user_count_summer(df_russia, "unique_chars")
nums_vietnam <- user_count_summer(df_vietnam, "unique_chars")
nums_spain <- user_count_summer(df_spain, "unique_chars")
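#sanity check (my addition, assuming User_count has no NAs): the weighted sums should preserve
#each country's total user count, and every country should get the same number of bins
stopifnot(sum(lengths_china) == sum(df_china$User_count),
          sum(lengths_spain) == sum(df_spain$User_count),
          length(lengths_china) == length(lengths_spain))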
#plotting the weighted length distributions by country
#(lengths holds the possible length values, 0 through the dataset-wide maximum, used as bar labels)
lengths <- 0:max(top_200_password_2020_by_country_extended$length)
barplot(lengths_china, names.arg = lengths, main = "China")
barplot(lengths_us, names.arg = lengths, main = "United States")
barplot(lengths_russia, names.arg = lengths, main = "Russia")
barplot(lengths_vietnam, names.arg = lengths, main = "Vietnam")
barplot(lengths_spain, names.arg = lengths, main = "Spain")
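#optional sketch (my addition): normalizing the weighted counts to shares so countries with very
#different total user counts can be compared on one scale; shown here for two countries only
share_china <- lengths_china / sum(lengths_china)
share_us <- lengths_us / sum(lengths_us)
barplot(rbind(China = share_china, `United States` = share_us),
        beside = TRUE, names.arg = lengths, legend.text = TRUE,
        main = "Share of users by password length")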
#running anova on length across countries, with User_count weighting each password
summary(aov(length ~ country, data=top_200_password_2020_by_country_extended, weights=User_count))
#running anova on unique_chars
summary(aov(unique_chars ~ country, data=top_200_password_2020_by_country_extended, weights=User_count))
#running anova on the count of numbers in each password
summary(aov(numbers ~ country, data=top_200_password_2020_by_country_extended, weights=User_count))
#running anova on sequentials
summary(aov(sequentials ~ country, data=top_200_password_2020_by_country_extended, weights=User_count))
library(pwr)
#power analysis for a one-way ANOVA with 5 groups of 200 (pwr.anova.test's f is Cohen's effect size f)
pwr.anova.test(k = 5, n = 200, f = 3.932, sig.level = 0.05)
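#note (my addition): Cohen's effect size f is usually well below 1; if 3.932 was read off an
#ANOVA table as an F statistic, f can instead be derived from eta-squared as sketched here.
#aov_len, eta_sq and cohens_f are names introduced for this sketch only.
aov_len <- aov(length ~ country, data=top_200_password_2020_by_country_extended, weights=User_count)
ss <- summary(aov_len)[[1]][["Sum Sq"]]   #row 1 = country (between-group), row 2 = residuals
eta_sq <- ss[1] / sum(ss)
cohens_f <- sqrt(eta_sq / (1 - eta_sq))
pwr.anova.test(k = 5, n = 200, f = cohens_f, sig.level = 0.05)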
#checking assumptions: let's check normality and equal variance with the residuals-vs-fitted and QQ plots
model_len <- lm(length ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_uniq <- lm(unique_chars ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_nums <- lm(numbers ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_seq <- lm(sequentials ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
plot(model_len)
plot(model_uniq)
plot(model_nums)
plot(model_seq)
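#optional (my addition): plot.lm can be restricted to just the two diagnostics named above,
#residuals vs fitted (which = 1) and the normal QQ plot (which = 2)
plot(model_len, which = 1:2)
plot(model_uniq, which = 1:2)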
#trying to fix assumptions via logarithms
model_len_log <- lm(log(length,10) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_uniq_log <- lm(log(unique_chars,10) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_nums_log <- lm(log(numbers,10) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_seq_log <- lm(log(sequentials,10) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
plot(model_len_log)
plot(model_uniq_log)
plot(model_nums_log)
plot(model_seq_log)
summary(model_len_log)
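#caveat (my addition): if any passwords contain no numbers or no sequential characters,
#log10(0) is -Inf and lm() will refuse to fit; a sketch using a +1 shift instead.
#model_nums_log1 and model_seq_log1 are names introduced here, and the shift is an
#assumption about how to handle zeros, not part of the original analysis.
model_nums_log1 <- lm(log10(numbers + 1) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
model_seq_log1 <- lm(log10(sequentials + 1) ~ country,data=top_200_password_2020_by_country_extended, weights=User_count)
plot(model_nums_log1, which = 1:2)
plot(model_seq_log1, which = 1:2)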
#checking spread for the sample size / effect size calculation: weighted SD of password length per country
library(Hmisc)
#lengths (0 through the dataset-wide maximum) was defined above for the barplots;
#each length value is weighted by that country's user-count total
sqrt(wtd.var(lengths, lengths_russia))
sqrt(wtd.var(lengths, lengths_china))
sqrt(wtd.var(lengths, lengths_us))
sqrt(wtd.var(lengths, lengths_spain))
sqrt(wtd.var(lengths, lengths_vietnam))
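#companion summary (my addition): weighted mean length per country next to the weighted SDs,
#using Hmisc's wtd.mean with the same user-count totals as weights
wtd.mean(lengths, lengths_russia)
wtd.mean(lengths, lengths_china)
wtd.mean(lengths, lengths_us)
wtd.mean(lengths, lengths_spain)
wtd.mean(lengths, lengths_vietnam)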
#pairwise t tests across countries
#length (wtd.t.test comes from the weights package, not Hmisc)
library(weights)
wtd.t.test(lengths, lengths, weight=lengths_us, weighty=lengths_china, bootse=TRUE, bootn=20)
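#the single call above only compares the US against China; a sketch (my addition) that loops
#over every pair of countries with the same weighted test. country_weights and country_pairs
#are names introduced here; the test call itself mirrors the one above.
country_weights <- list(China = lengths_china, `United States` = lengths_us,
                        Russia = lengths_russia, Vietnam = lengths_vietnam, Spain = lengths_spain)
country_pairs <- combn(names(country_weights), 2)
for(j in seq_len(ncol(country_pairs))){
  a_name <- country_pairs[1, j]
  b_name <- country_pairs[2, j]
  cat("\n", a_name, "vs", b_name, "\n")
  print(wtd.t.test(lengths, lengths, weight = country_weights[[a_name]],
                   weighty = country_weights[[b_name]], bootse = TRUE, bootn = 20))
}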
#weighted linear regression with robust SEs
#compare the fits with conventional and robust SEs to see how much influence the non-constant variance has
library(lmtest)
library(sandwich) #vcovHC() comes from sandwich, not lmtest
summary(model_len)
coeftest(model_len, vcov = vcovHC(model_len))
coeftest(model_uniq, vcov = vcovHC(model_uniq))
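#optional (my addition): lmtest's Breusch-Pagan test as a formal check of the non-constant
#variance that the robust standard errors are guarding against
bptest(model_len)
bptest(model_uniq)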
a <- aov(length ~ country, data=top_200_password_2020_by_country_extended, weights=User_count)
#note: base pairwise.t.test has no weights argument, so User_count cannot be applied here; these comparisons are unweighted
pairwise.t.test(top_200_password_2020_by_country_extended$length, top_200_password_2020_by_country_extended$country)
pairwise.t.test(top_200_password_2020_by_country_extended$unique_chars, top_200_password_2020_by_country_extended$country)
pairwise.t.test(top_200_password_2020_by_country_extended$numbers, top_200_password_2020_by_country_extended$country)
pairwise.t.test(top_200_password_2020_by_country_extended$sequentials, top_200_password_2020_by_country_extended$country)
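#alternative (my addition): the aov fit stored in a above also supports TukeyHSD, which gives
#pairwise length comparisons with a family-wise error correction
TukeyHSD(a)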
#checking equal variances across countries with Bartlett's test
bartlett.test(top_200_password_2020_by_country_extended$length, top_200_password_2020_by_country_extended$country)
#direct two-sample t test on length between the US and China subsets (unweighted)
t.test(df_us$length, df_china$length)