上图中每个条形的长度,表示该超级英雄使用对应词汇的程度。
可视化过程
最后,分析完全剧的角色,我们也来一起看看整个可视化过程。
导入R语言包
# Attach every package used by the visualisation code.
# Each call sits on its own line: R cannot parse several statements on one
# line unless they are separated by `;`.
library(dplyr)
library(grid)
library(gridExtra)
library(ggplot2)
library(reshape2)
library(cowplot)
library(jpeg)
library(extrafont)
清除R工作环境中的全部东西
# Remove every object returned by ls() from the current environment so the
# script starts from a clean workspace.
# NOTE(review): this also deletes anything the reader had already created in
# the session; attached packages are not affected.
rm(list = ls())
将工作目录设置为存放所有图片的文件夹(请根据你自己的情况修改路径)
# Folder that holds the character images; edit this path for your own machine.
dir_images <- "C:\\Users\\Matt\\Documents\\R\\Avengers"

# Make that folder the working directory so files in it can be referred to by
# bare file name.
setwd(dir_images)
设置字体
# Register the system font "Franklin Gothic Demi" under the family name
# "Franklin" so it can be requested by that name later on.
# NOTE(review): windowsFonts()/windowsFont() are Windows-only graphics-device
# helpers; confirm an alternative before running on another platform.
windowsFonts(Franklin=windowsFont("Franklin Gothic Demi"))
英雄角色名字的简化版本
# Simplified (file-name style) identifiers, one per character.
character_names <- c(
  "black_panther", "black_widow", "bucky", "captain_america",
  "falcon", "hawkeye", "hulk", "iron_man",
  "loki", "nick_fury", "rhodey", "scarlet_witch",
  "spiderman", "thor", "ultron", "vision"
)

# Each character's image file name is its identifier plus the ".jpg" extension.
image_filenames <- paste0(character_names, ".jpg")
将所有图片读入一个列表中。
# Read every image file into a list, one element per file, in the same order
# as image_filenames.
# The reader is jpeg::readJPEG: no `read_image` function is defined after the
# workspace is cleared, and the files are all ".jpg".
all_images <- lapply(image_filenames, jpeg::readJPEG)
将角色名字分配给图像列表,以便按名字对其进行索引。
# Label each list element with its character identifier so that an image can
# be fetched by name, e.g. all_images[["vision"]].
all_images <- setNames(all_images, character_names)
例如
# Clear the plot window.
grid.newpage()

# Draw one image, looked up by character name, to the plot window.
grid.draw(rasterGrob(all_images[["vision"]]))
获得文本数据
数据由计算机科学家 Elle O'Brien 收集,她使用文本挖掘技术对电影剧本进行了分析。
更正专有名称的大写
# Upper-case the first character of a string; Vectorize() applies the function
# to each element of its argument.
capitalize <- Vectorize(function(string) {
  substr(string, 1, 1) <- toupper(substr(string, 1, 1))
  return(string)
})

# Lower-case words that should be displayed with an initial capital.
proper_noun_list <- c(
  "clint", "hydra", "steve", "tony",
  "sam", "stark", "strucker", "nat", "natasha",
  "hulk", "tesseract", "vision",
  "loki", "avengers", "rogers", "cap", "hill"
)

# Run the capitalization function.
# NOTE(review): word_data must already exist with a `word` column; the step
# that loads it is not shown in this excerpt.
word_data <- word_data %>%
  mutate(word = ifelse(word %in% proper_noun_list, capitalize(word), word)) %>%
  # "jarvis" is written fully upper-case rather than just capitalized.
  mutate(word = ifelse(word == "jarvis", "JARVIS", word))
请注意,前面定义的简化版角色名与文本数据框(data frame)中的角色名并不一致。
# List the distinct values of the Speaker column; the printed result is
# reproduced in the comment lines below.
unique(word_data$Speaker)
## [1] "Black Panther" "Black Widow" "Bucky"
## [4] "Captain America" "Falcon" "Hawkeye"
## [7] "Hulk" "Iron Man" "Loki"
## [10] "Nick Fury" "Rhodey" "Scarlet Witch"
## [13] "Spiderman" "Thor" "Ultron"
## [16] "Vision"
创建一个索引表,将文件名转换为角色名。
# Lookup table: element names are the simple (file-name style) identifiers,
# element values are the display names.
character_labeler <- c(
  black_panther   = "Black Panther",
  black_widow     = "Black Widow",
  bucky           = "Bucky",
  captain_america = "Captain America",
  falcon          = "Falcon",
  hawkeye         = "Hawkeye",
  hulk            = "Hulk",
  iron_man        = "Iron Man",
  loki            = "Loki",
  nick_fury       = "Nick Fury",
  rhodey          = "Rhodey",
  scarlet_witch   = "Scarlet Witch",
  spiderman       = "Spiderman",
  thor            = "Thor",
  ultron          = "Ultron",
  vision          = "Vision"
)
有两个不同版本的角色名,一个用于显示(漂亮),一个用于索引(简单)
# Translate display names into simple identifiers by finding the entry of
# character_labeler whose value equals the display name and returning that
# entry's name. Vectorize() applies the lookup to each element of the input.
convert_pretty_to_simple <- Vectorize(function(pretty_name) {
  # e.g. pretty_name = "Vision"
  names(character_labeler)[character_labeler == pretty_name]
})
# convert_pretty_to_simple(c("Vision", "Thor"))

# Just for fun, the inverse of that function: index character_labeler by the
# simple identifiers and strip the names from the result.
convert_simple_to_pretty <- function(simple_name) {
  # e.g. simple_name = "vision"
  as.vector(character_labeler[simple_name])
}

# Example
convert_simple_to_pretty(c("vision", "black_panther"))
## [1] "Vision" "Black Panther"
将简化的角色名称添加到文本数据框中。
# Add a `character` column holding the simple identifier that corresponds to
# each row's Speaker value.
word_data$character <- convert_pretty_to_simple(word_data$Speaker)
为每个角色指定主颜色
# Primary colour for each character, as a hex code keyed by the character's
# simple identifier.
character_palette <- c(
  black_panther   = "#51473E",
  black_widow     = "#89B9CD",
  bucky           = "#6F7279",
  captain_america = "#475D6A",
  falcon          = "#863C43",
  hawkeye         = "#84707F",
  hulk            = "#5F5F3F",
  iron_man        = "#9C2728",
  loki            = "#3D5C25",
  nick_fury       = "#838E86",
  rhodey          = "#38454E",
  scarlet_witch   = "#620E1B",
  spiderman       = "#A23A37",
  thor            = "#323D41",
  ultron          = "#64727D",
  vision          = "#81414F"
)
绘制条形图☟
# Faceted horizontal bar chart of each speaker's top words.
# The data steps are chained with %>%; once ggplot() has been called, every
# further layer, scale, theme and facet has to be added with `+`.
avengers_bar_plot <- word_data %>%
  # Keep the 5 rows with the largest `amount` within each speaker.
  group_by(Speaker) %>%
  top_n(5, amount) %>%
  ungroup() %>%
  # Order the words by `amount` so the bars appear sorted.
  mutate(word = reorder(word, amount)) %>%
  ggplot(aes(x = word, y = amount, fill = character)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  # Fill each bar with its character's colour from the palette.
  scale_fill_manual(values = character_palette) +
  scale_y_continuous(
    name = "Log Odds of Word",
    breaks = c(0, 1, 2)
  ) +
  theme(
    text = element_text(family = "Franklin"),
    # axis.title.x = element_text(size = rel(1.5)),
    panel.grid = element_line(colour = NULL),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white")
  ) +
  # theme(strip.text.x = element_text(size = rel(1.5))) +
  xlab("") +
  # Flip the axes so the bars run horizontally.
  coord_flip() +
  # One panel per speaker, each with its own set of words on the word axis.
  facet_wrap(~Speaker, scales = "free_y")

# Print the plot.
avengers_bar_plot
这已经非常漂亮了,但是还可以更漂亮:比如让人物形象透过“条形”显示出来。具体做法是先在人物图片上绘制透明的条形,再从每个透明条形的末端向图片边缘绘制白色的条形;由于条形能够遮挡图片,图片的其余部分就被白色条形盖住了。