-
Notifications
You must be signed in to change notification settings - Fork 0
/
EDA Zillow.R
executable file
·106 lines (97 loc) · 3.91 KB
/
EDA Zillow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
############################################################
# Zillow Prize: Zillow's Home Value Prediction (Zestimate) #
############################################################
# Import libraries
library(data.table)
library(dplyr)
library(ggplot2)
library(stringr)
library(tidyr)
library(lubridate)
library(leaflet)
library(leaflet.extras)
# Importing Data
# Property attributes. The coordinate columns are divided by 1e6 once, right
# here at load time, so every later use of `properties` sees the rescaled
# latitude/longitude.
properties <- fread("~/Desktop/zillow/properties_2016.csv") %>%
  mutate(
    latitude = latitude / 1e6,
    longitude = longitude / 1e6
  )

# Transaction records; the sections below read parcelid, logerror and
# transactiondate from this table.
train <- fread("~/Desktop/zillow/train_2016_v2.csv")
# Preliminary Data Analysis
## Transaction volume by date
# Collapse each transaction date to a year-month date (make_date() is given
# only year and month), count transactions per month, and draw the counts as
# bars. A vertical line is drawn at 2016-10-15.
train %>%
  mutate(
    year_month = make_date(
      year = year(transactiondate),
      month = month(transactiondate)
    )
  ) %>%
  count(year_month) %>%
  ggplot(aes(x = year_month, y = n)) +
  geom_col(color = "black", fill = "blue", alpha = .5) +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2016-10-15"))),
    size = 1
  )
## Distribution of logerror (central 99%)
# Keep only rows whose logerror lies between the 0.5% and 99.5% quantiles of
# the full training set, then overlay a density-scaled histogram with a
# kernel density curve.
train %>%
  filter(logerror %between% quantile(train$logerror, c(.005, .995))) %>%
  ggplot(aes(x = logerror)) +
  geom_histogram(
    aes(y = ..density..),
    bins = 50,
    color = "black",
    fill = "blue",
    alpha = .5
  ) +
  geom_density(alpha = .2, fill = "blue")
## Distribution of absolute logerror (central 99%)
# Same quantile trimming as the signed logerror plot (0.5% to 99.5% of the
# full training set), but the histogram and density are drawn for
# abs(logerror).
train %>%
  filter(logerror %between% quantile(train$logerror, c(.005, .995))) %>%
  mutate(abslogerr = abs(logerror)) %>%
  ggplot(aes(x = abslogerr)) +
  geom_histogram(
    aes(y = ..density..),
    bins = 50,
    color = "black",
    fill = "blue",
    alpha = .5
  ) +
  geom_density(alpha = .2, fill = "blue")
## Mean of absolute logerror over time
# Per year-month: the mean of abs(logerror) and its standard error
# (sqrt(variance / group size)). Plotted as a dashed line with points and
# error bars of +/- 1.96 standard errors.
train %>%
  mutate(
    year_month = make_date(
      year = year(transactiondate),
      month = month(transactiondate)
    ),
    abs_err = abs(logerror)
  ) %>%
  group_by(year_month) %>%
  summarise(
    meanerr = mean(abs_err),
    stderr = sqrt(var(abs_err) / n())
  ) %>%
  ggplot(aes(x = year_month, y = meanerr)) +
  geom_line(color = "blue", linetype = "dashed") +
  geom_errorbar(
    aes(
      ymin = meanerr - 1.96 * stderr,
      ymax = meanerr + 1.96 * stderr
    ),
    color = "blue",
    width = 10
  ) +
  geom_point(size = 2, color = "blue")
## Distribution of absolute logerror by month (central 99%)
# After trimming logerror to the 0.5%-99.5% quantile range of the full
# training set, draw one density-scaled histogram of abs(logerror) per
# year-month. year_month is turned into a factor so it can be used as the
# facetting variable.
train %>%
  filter(logerror %between% quantile(train$logerror, c(.005, .995))) %>%
  mutate(
    year_month = as.factor(
      make_date(
        year = year(transactiondate),
        month = month(transactiondate)
      )
    ),
    abslogerr = abs(logerror)
  ) %>%
  ggplot(aes(x = abslogerr)) +
  geom_histogram(
    aes(y = ..density..),
    alpha = .5,
    fill = "blue",
    bins = 30
  ) +
  facet_wrap(~ year_month)
## logerror geographic distribution
# Join the property attributes to the transactions (inner_join keeps only
# parcels present in `train`), average logerror per distinct coordinate
# pair, and show two switchable leaflet heatmaps:
#   * "Property heatmap" - constant intensity per coordinate pair
#   * "logerror heatmap" - intensity taken from the mean logerror
#
# NOTE: latitude/longitude are already divided by 1e6 where `properties` is
# loaded. They must not be rescaled again here: a second division scales the
# raw values by 1e12 overall and puts every point in the wrong place.
properties %>%
  inner_join(train, by = "parcelid") %>%
  group_by(longitude, latitude) %>%
  summarise(logerror = mean(logerror)) %>%
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addHeatmap(
    lng = ~longitude, lat = ~latitude,
    intensity = .02,
    blur = 5, radius = 4,
    group = "Property heatmap"
  ) %>%
  addHeatmap(
    lng = ~longitude, lat = ~latitude,
    intensity = ~logerror,
    blur = 5, radius = 4,
    group = "logerror heatmap"
  ) %>%
  addLayersControl(
    baseGroups = c("Property heatmap", "logerror heatmap"),
    options = layersControlOptions(collapsed = FALSE)
  )
## Missing percentage
# Restrict `properties` to parcels that occur in `train`, compute for every
# column the share of NA values (NA count divided by row count, i.e. a
# fraction between 0 and 1), reshape to one row per feature, and plot the
# features as horizontal bars ordered by that share.
properties %>%
  semi_join(train, by = "parcelid") %>%
  summarise_all(funs(sum(is.na(.)) / n())) %>%
  gather(key = "feature", value = "missing_pct") %>%
  ggplot(aes(x = reorder(feature, missing_pct), y = missing_pct)) +
  geom_col(color = "black", fill = "blue", alpha = .5) +
  coord_flip()