Web Scraping & Text Mining

Libraries

library(rvest) #web scrapping
library(wordcloud2) #word cloud graph 
library(tm) #text mining 
library(SnowballC) #text stemming 
library(RColorBrewer) #color palettes

Web Scraping

# Download and parse the HTML page that holds the speech.
url <- "https://www.artofmanliness.com/second-inaugural-address-of-abraham-lincoln/"
speech <- read_html(url)

# Page title: text of the first <title> element.
speech_title <- speech %>%
  html_node("title") %>%
  html_text()

# Page text: the text of every <p> element, one string per paragraph.
speech_text <- speech %>%
  html_nodes("p") %>%
  html_text()

Text Mining

# Wrap the scraped paragraphs in a tm corpus, one document per element.
speech_text <- Corpus(VectorSource(speech_text))

# Clean the corpus one transformation at a time, in this order:
# lower-case, drop digits, drop English stop words, drop a few extra
# words, strip punctuation, then collapse repeated whitespace.
text <- tm_map(speech_text, content_transformer(tolower))
text <- tm_map(text, removeNumbers)
text <- tm_map(text, removeWords, stopwords("english"))
text <- tm_map(
  text,
  removeWords,
  c("else", "yet", "due", "see", "away", "ago", "aom")
)
text <- tm_map(text, removePunctuation)
text <- tm_map(text, stripWhitespace)
# Stemming stays switched off: text <- tm_map(text, stemDocument)

# Term-document matrix (terms in rows), converted to a plain matrix.
text_matrix <- as.matrix(TermDocumentMatrix(text))

# Total count of each term across all documents, most frequent first.
sort_text <- sort(rowSums(text_matrix), decreasing = TRUE)
sort_df <- data.frame(words = names(sort_text), frequency = sort_text)

Word-cloud Plotting

# Fix the RNG seed so any R-side randomness is reproducible.
# NOTE(review): wordcloud2 renders through an htmlwidget, so the word
# layout and random colours may not be governed by this seed - confirm.
set.seed(123)

# Draw the word cloud from the word/frequency table.
# `shuffle` is a logical flag, so pass TRUE rather than the string "TRUE".
wordcloud2(
  data = sort_df,
  size = 2,
  fontFamily = "Comic Sans MS",
  shuffle = TRUE,
  color = "random-light"
)

Web Scraping & Text Mining

Data Analyst Moses Kioko

3rd March 2020