# R

# 1. Load the data

Download the news dataset.

# Get the current working directory
getwd()

# Set the working directory
setwd("/Users/{username}/project/")

# Load the data; for large files, data.table::fread is recommended
# https://www.r-bloggers.com/efficiency-of-importing-large-csv-files-in-r/
library(data.table)

datac <- fread("page_entity_20180511.csv",sep = ',')
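A quick sanity check right after the import confirms the separator and column count were read correctly; these are plain base R / data.table calls on the table loaded above.

# Quick sanity check on the imported table
dim(datac)        # number of rows and columns
head(datac, 3)    # preview the first records
str(datac)        # column types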

# 2. Preprocess the data

# Extract plain text from HTML
library(htm2txt)

# Rename the columns
names(datac) <- c("id", "page_title", "page_url", "page_content", "page_host_id", "created_at", "updated_at", "page_image", "page_topics", "page_date")

titles <- datac$page_title
titles <- htm2txt(titles)
titles <- gsub("[\n\t]+", "", titles)   # strip embedded newlines and tabs

urls <- datac$page_url
urls <- gsub("[\n\t]+", "", urls)       # strip embedded newlines and tabs

datac$page_title <- titles
datac$page_url <- urls

write.csv(file="page_entity.csv", x=datac)
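If the table is large, data.table's own fwrite() is a much faster drop-in for write.csv; a sketch writing the same file (note fwrite does not add row names, unlike write.csv):

# Faster CSV export via data.table (same output file as above)
fwrite(datac, "page_entity.csv")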

# 2.1 Merge data frames

# Read the second table that stores the full body content
data_content = fread("page_content_full.csv", sep=',')

# Merge the renamed table with the content table on the shared page_url column
datam <- merge(x = datac, y = data_content, by = "page_url", all.x = TRUE)

# Keep only the rows that actually have body content
datacontent <- datam[!is.na(datam$page_content), ]
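Later sections refer to a cleaned-text column called cleancontent that is never created in the steps shown here. A minimal sketch of how it could be derived from the merged body text follows; the column name and the exact cleaning steps are assumptions, not part of the original notes.

# Assumed cleaning step: strip HTML and collapse whitespace in the body text
datacontent$cleancontent <- htm2txt(datacontent$page_content)
datacontent$cleancontent <- gsub("[\r\n\t]+", " ", datacontent$cleancontent)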

# 3. Search for keywords

# Row indices of the titles that mention the keyword
text_with_s <- grep("美国", titles)

# The matching titles themselves
text_with_s <- grep("美国", titles, value = TRUE)
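The same pattern can also subset the full records rather than just the titles, e.g. every page whose cleaned title mentions the keyword (data.table syntax on the datac table above):

# All records whose cleaned title contains the keyword
us_pages <- datac[grepl("美国", page_title)]
nrow(us_pages)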

# 4. Word segmentation

install.packages("jiebaR")
library("jiebaR")

# Segmentation worker with a custom stop-word list, plus a POS-tagging worker
cc <- worker(bylines = TRUE, stop_word = "stopwords_zh_cn.txt")
tagger <- worker("tag")

# Segment the cleaned body text (one token vector per document because bylines = TRUE)
contents <- datacontent$cleancontent
tokencn <- segment(contents, cc)

# POS-tag the tokens and keep only nouns (tags starting with "n")
tags <- vector_tag(unlist(tokencn), tagger)
tagsns <- tags[grep("^n", names(tags))]

# Extract the top 10 keywords from the segmented tokens
extract <- worker("keywords", topn = 10)
kws <- vector_keywords(unlist(tokencn), extract)
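To eyeball the segmentation result, a plain base-R frequency table over the unlisted tokens shows the most common words:

# Most frequent words across all segmented documents
wordfreq <- sort(table(unlist(tokencn)), decreasing = TRUE)
head(wordfreq, 20)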

# 5. Compute feature words and lexical complexity

# quanteda provides tokens(), dfm(), stopwords() and the textstat_* helpers used below
library(quanteda)

cal_dfm <- function(id, data) {
  # Subset to a single host id; 'data' is assumed to be the working table with a
  # V5 (host id) column and a 'cleancontent' column of cleaned body text
  datap <- data[data$V5 == id, ]
  contents <- datap$cleancontent

  # Tokenise, drop punctuation and Chinese stop words, then build the dfm
  tokensp <- tokens(contents, remove_punct = TRUE)
  tokensp <- tokens_remove(tokensp, stopwords("zh", source = "misc"))
  dfmp <- dfm(tokensp)

  print(topfeatures(dfmp, n = 20))
  dfmp
}

cal_dfm_tfidf <- function(dfmp) {
  # TF-IDF weighting with proportional term frequencies
  dfmw <- dfm_tfidf(dfmp, scheme_tf = "prop")
  print(topfeatures(dfmw, n = 20))
  dfmw
}

cal_ttr <- function(id, data) {
  datap <- data[data$V5 == id, ]
  contents <- datap$cleancontent

  # Type-token ratio per document
  tokensp <- tokens(contents, remove_punct = TRUE)
  ttr <- ntype(tokensp) / ntoken(tokensp)
  hist(ttr)
  ttr
}

# Example: run the two functions for host id 1
cal_dfm(1, data)
cal_ttr(1, data)
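cal_dfm_tfidf() defined above is never called in the original notes; chaining it after cal_dfm() applies the TF-IDF weighting, for example:

# TF-IDF weighted top features for host id 1
dfm_host1 <- cal_dfm(1, data)
cal_dfm_tfidf(dfm_host1)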

# 6. LDA

# 6.1 Choose the number of LDA topics

library(topicmodels)
library(ldatuning)

# Tokenise the body text, drop punctuation and Chinese stop words, build the dfm,
# and convert it to the DocumentTermMatrix format expected by topicmodels/ldatuning
toks1 <- tokens(datacontent$page_content, remove_punct = TRUE)
toks1 <- tokens_remove(toks1, stopwords("zh", source = "misc"))
dfm1 <- dfm(toks1)
dtm1 <- convert(dfm1, to = "topicmodels")

result <- FindTopicsNumber(
  dtm1,
  topics = seq(from = 2, to = 120, by = 2),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
FindTopicsNumber_plot(result)

# Chosen by inspecting the plot above: pick a k where Griffiths2004/Deveaud2014
# have levelled off and CaoJuan2009/Arun2010 are near their minimum
number_of_topics <- 40

# 6.2 Fit the LDA model

tm1 <- LDA(dtm1, k = number_of_topics, method = "Gibbs", control = list(seed = 1234))

library(tidytext)
library(dplyr)
library(ggplot2)
tmt1 <- tidy(tm1, matrix="beta")

terms1 <- tmt1  %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

terms1 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = 'STHeiti'))
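To keep the facetted topic-term plot, ggsave() can write it to disk; the file name and dimensions below are arbitrary, and in a non-interactive script it is safer to assign the ggplot chain to an object and pass it explicitly.

# Save the most recently displayed topic-term plot (file name/size are arbitrary)
ggsave("lda_topic_terms.png", width = 16, height = 12, dpi = 150)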

# 6.3 Group documents by topic

# Per-document topic distribution from the fitted model
post <- posterior(tm1)[["topics"]]

postmat <- as.data.frame(post)

postdat <- tibble::rownames_to_column(postmat, "doc_name")

colnames(postdat)[-1] <- apply(terms(tm1, 3), 2, paste, collapse=", ")

heatmap(post, scale="none")

group_documents <- function(topic_vec) {
  # Sort documents by their probability for this topic
  sorted <- sort(topic_vec, decreasing = TRUE, index.return = TRUE)
  dists <- sorted$x
  index <- sorted$ix
  top_p <- as.numeric(dists[1])

  # Only keep a group when the strongest document clears 0.7,
  # then keep every document within 80% of that maximum
  if (top_p < 0.7) {
    return(c())
  } else {
    threshold <- top_p * 0.8
    names(dists) <- index
    return(dists[dists > threshold])
  }
}

groups <- sapply(postdat[-1], group_documents)
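The names of each vector in groups are row indices into the posterior matrix, so they can be mapped back to page titles, assuming groups comes back as a list (which sapply returns when group sizes differ) and the document order of datacontent matches dfm1:

# Titles of the documents grouped under the first topic (if any passed the 0.7 cut-off)
idx <- as.integer(names(groups[[1]]))
datacontent$page_title[idx]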

# 7. FRE (readability)

cal_fre <- function(ids, data) {
  # Readability scores for the documents of the given host ids
  # (textstat_readability ships with quanteda; in quanteda >= 3 it lives in quanteda.textstats)
  datap <- data[data$V5 %in% ids, c("V5", "cleancontent")]
  textstat_readability(datap$cleancontent,
                       c("Flesch", "Dale.Chall", "SMOG", "Coleman.Liau", "Fucks"))
}

all_measures <- cal_fre(c(1), data)

readability_matrix <- cbind(all_measures$Flesch, all_measures$Dale.Chall, all_measures$SMOG, all_measures$Coleman.Liau, all_measures$Fucks)

# Correlation between the different readability measures
fre_cor <- cor(readability_matrix)
fre_cor

# Re-create the per-host subset at top level so the scores can be attached to it
datap <- data[data$V5 %in% c(1), ]

datap$fre_flesch <- all_measures$Flesch
datap$fre_dc <- all_measures$Dale.Chall
datap$fre_smog <- all_measures$SMOG
datap$fre_cl <- all_measures$Coleman.Liau
datap$fre_fucks <- all_measures$Fucks

hist(datap$fre_flesch, xlim = c(0, 120), breaks = "FD")

# Standard error of the mean
std <- function(x) sd(x) / sqrt(length(x))
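Applied to the readability scores computed above, std() gives the standard error of the mean, for example:

# Standard error of the mean Flesch score (drop NA scores first)
std(na.omit(all_measures$Flesch))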

# 8. Save the workspace

save.image("workspace.RData")