forked from Shrinad/big_data_bootcamp_2016
-
Notifications
You must be signed in to change notification settings - Fork 0
/
complete.R
118 lines (72 loc) · 3.03 KB
/
complete.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
library(dplyr)
library(magrittr)
library(tm)
library(topicmodels)
library(jsonlite)
set.seed(1300)
options(mc.cores = 1)
sotu = fromJSON(txt = './sotu_parsed.json')
# TM Pipeline
sotu_corpus <- Corpus(VectorSource(sotu$content)) %>%
tm_map(x = ., FUN = PlainTextDocument) %>%
tm_map(x = ., FUN = removePunctuation) %>%
tm_map(x = ., FUN = removeNumbers) %>%
tm_map(x = ., FUN = removeWords, stopwords(kind = 'en')) %>%
tm_map(x = ., FUN = stripWhitespace)
stopwords_extra <- c(stopwords(kind = 'en'), 'will', 'thank', 'can')
dtm <- DocumentTermMatrix(x = Corpus(VectorSource(sotu$content)),
control = list(tokenize = words,
tolower = TRUE,
stopwords = stopwords_extra,
stemming = TRUE,
removePunctuation = TRUE,
removeNumbers = TRUE,
minDocFreq = length(sotu$content) * 0.05,
maxDocFreq = length(sotu$content) * 0.80,
weighting = weightTf))
doc_term <- DocumentTermMatrix(sotu_corpus)
doc_term$dimnames$Docs <- sotu$file_name
tf_idf <- weightTfIdf(m = doc_term, normalize = TRUE)
tf_idf_mat <- as.matrix(tf_idf)
tf_idf_dist <- dist(tf_idf_mat, method = 'euclidean')
# Cluster: KMeans
tf_idf_norm <- tf_idf_mat / apply(tf_idf_mat, MARGIN = 1, FUN = function(x) sum(x^2)^0.5)
km_clust <- kmeans(x = tf_idf_norm, centers = 5, iter.max = 25)
pca_comp <- prcomp(tf_idf_norm)
pca_rep <- data_frame(sotu_name = sotu$file_name,
pc1 = pca_comp$x[,1],
pc2 = pca_comp$x[,2],
clust_id = as.factor(km_clust$cluster))
# LDA
lda <- LDA(x = dtm, k = 8, method = 'VEM')
terms(x = lda, k = 10)
topics(x = lda, k = 5)
# DF: Terms
term_extract <- terms(lda, k = 25)
v_topics <- c(); v_terms <- c(); v_ranks <- c(); v_betas <- c()
for (topic in 1:lda@k) {
for (term in 1:25) {
v_topics <- c(v_topics, topic)
v_terms <- c(v_terms, term_extract[term, topic])
v_ranks <- c(v_ranks, term)
v_betas <- c(v_betas, lda@beta[topic, which(lda@terms == term_extract[term, topic])])
}
}
term_rankings <- data_frame(topic = v_topics,
term = v_terms,
rank = v_ranks,
term_beta = v_betas)
rm(v_topics, v_terms, v_ranks, v_betas)
# DF: Document Distributions
v_topics <- c(); v_docs <- c(); v_gammas <-c()
for (topic in 1:lda@k) {
for (doc in 1:length(lda@documents)) {
v_topics <- c(v_topics, topic)
v_docs <- c(v_docs, lda@documents[doc])
v_gammas <- c(v_gammas, lda@gamma[doc, topic])
}
}
doc_distributions <- data_frame(topic = v_topics,
doc = v_docs,
doc_gamma = v_gammas)
rm(v_topics, v_docs, v_gammas)