-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda.R
47 lines (33 loc) · 1.25 KB
/
lda.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Fit an LDA topic model via collapsed Gibbs sampling (topicmodels package).
library(topicmodels)

# Gibbs sampler control parameters
burnin <- 4000  # iterations discarded before sampling begins
iter <- 2000    # sampling iterations after burn-in
thin <- 500     # keep every thin-th draw to reduce autocorrelation
# One RNG seed per restart; length must equal nstart.
# (Original had "100 000", a syntax error — corrected to 100000.)
seed <- list(2003, 5, 63, 100000, 765)
# Original used "nstart < 5" (a comparison, not an assignment), so nstart
# was never defined and the LDA() call below would fail.
nstart <- 5     # number of independent sampler restarts
best <- TRUE    # keep only the restart with the highest posterior
# Number of topics
k <- 4
# Build a term-frequency document-term matrix from the corpus
# (NOTE(review): `corpus` must be created upstream — confirm it is a tm corpus)
# and fit the LDA model with the Gibbs parameters defined above.
dtm.lda <- DocumentTermMatrix(corpus, control = list(weighting = weightTf))
# Fixes vs. original: "liist" -> "list"; fit on dtm.lda (dtm.lda.new is
# never defined in this file); pass best so only the top restart is kept.
ldaout <- LDA(dtm.lda, k, method = "Gibbs",
              control = list(nstart = nstart, seed = seed, best = best,
                             burnin = burnin, iter = iter, thin = thin))
# Most-likely topic for each document
ldaout.topics <- as.matrix(topics(ldaout))
# Top 6 terms per topic
ldaout.terms <- as.matrix(terms(ldaout, 6))
# Per-document topic probabilities (gamma slot: documents x topics)
topic_probabilities <- as.data.frame(ldaout@gamma)
# Relative importance of the top topic vs. the runner-up for each document:
# ratio of the largest to the second-largest probability in the row.
# (dtm.lda.new is never defined in this file — iterate over dtm.lda, the
# matrix the model was fitted on; seq_len is safe when nrow is 0.)
topic1ToTopic2 <- lapply(seq_len(nrow(dtm.lda)), function(x)
  sort(topic_probabilities[x, ])[k] / sort(topic_probabilities[x, ])[k - 1])
# Attach each document's dominant topic as a new column.
# NOTE(review): df.lda.new is not defined in this file — presumably the
# document data frame built by an upstream script; confirm against caller.
topic_vector <- as.vector(ldaout.topics)
lda_cluster <- cbind(df.lda.new, topic_vector)
# Rebuild the document-term matrix over bigrams with k = 2 topics.
k <- 2
# Bigram tokenizers via RWeka. RWeka's control constructor is Weka_control
# (capital W) — the original "weka_control" does not exist.
# NOTE(review): `tokenizer` duplicates BiGramTokenizer and is unused below;
# kept for backward compatibility with any external caller.
tokenizer <- function(x)
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
BiGramTokenizer <- function(x)
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Fixes vs. original: "DTM" is not a tm/topicmodels function
# (DocumentTermMatrix is); "BigramTokenizer" referenced an undefined name
# (the function defined above is BiGramTokenizer).
dtm.lda <- DocumentTermMatrix(corpus,
                              control = list(weighting = weightTf,
                                             tokenize = BiGramTokenizer))
# Drop terms absent from more than 98% of documents
dtm.lda <- removeSparseTerms(dtm.lda, sparse = 0.98)