imdb_analysis.Rmd

---
title: "Analysis"
author: "Anabel Berjón Sánchez, Ba Linh Le, Adina Spertus-Melhus"
output:
    html_document:
      theme: paper
      toc: yes
      toc_float:
        collapsed: false
---

# Setup
```{r setup}
# Show each chunk's source code in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

library(dplyr)
library(data.table)
library(plm) 
library(stringr)
library(stargazer)
```

# Topics overview

``` {r }
# Lookup table pairing each of the ten topics with a genre label.
# Labels carrying "(?)" appear to be tentative assignments.
Topics <- paste0("Topic ", 1:10)
Genres <- c(
  "Battle / Space War",
  "Life / Career",
  "Martial Arts / Historical",
  "Rural / Family / Historical",
  "Movies (?)",
  "Horror",
  "Romance",
  "Friendships (?)",
  "Crime",
  " (?)"
)

# Two-column table; the genre column gets a display-friendly header.
overview <- data.frame(Topics, Genres)
names(overview)[names(overview) == "Genres"] <- "Associated Genres"

# Persist the table, then print it in the document.
saveRDS(overview, file = "data/overview_topics.rds")
overview
```


# Kaggle data analyses

## Data preparation
```{r prepare data}
# Load the Kaggle data set.
dat_kaggle <- readRDS("data/dat_kaggle.rds")

# Relabel columns 4 through 13 as Topic_1 ... Topic_10, in order.
# NOTE(review): this relies on the column positions of the stored file.
names(dat_kaggle)[4:13] <- paste0("Topic_", 1:10)
```


First, we will perform Least Squares Dummy Variables (LSDV) estimation with `lm()` to get an individual estimate for each unit. Second, we will run our model with `plm()`, which applies the same mechanics but does not report each unit's intercept.

## Topics Model 
``` {r topic model , warning = FALSE}
# Linear model of rating on all ten topic columns.
topics_only_model <- lm(
  rating ~ Topic_1 + Topic_2 + Topic_3 + Topic_4 + Topic_5 +
    Topic_6 + Topic_7 + Topic_8 + Topic_9 + Topic_10,
  data = dat_kaggle
)

# Print the fit, then store the model object for later reuse.
summary(topics_only_model)
saveRDS(topics_only_model, file = "Tables/topics_only_model.rds")
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the topics-only model.
stargazer(
  topics_only_model,
  type = "html",
  header = FALSE,
  title = "Topic Model: Which Topics have better ratings?",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)

```

## Country Model 
``` {r country model, warning = FALSE}
# Treat country as a categorical variable.
dat_kaggle$country <- as.factor(dat_kaggle$country)

# 0/1 indicator columns: 1 where `country` equals the given name,
# 0 everywhere else (rows with a missing country included).
dat_kaggle$USA    <- as.numeric(dat_kaggle$country %in% "USA")
dat_kaggle$India  <- as.numeric(dat_kaggle$country %in% "India")
dat_kaggle$UK     <- as.numeric(dat_kaggle$country %in% "UK")
dat_kaggle$Canada <- as.numeric(dat_kaggle$country %in% "Canada")
dat_kaggle$France <- as.numeric(dat_kaggle$country %in% "France")

# Linear model of rating on the five country indicators.
country_model <- lm(
  rating ~ USA + India + UK + Canada + France,
  data = dat_kaggle
)
summary(country_model)
saveRDS(country_model, file = "Tables/country_model.rds")
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the country model.
stargazer(
  country_model,
  type = "html",
  header = FALSE,
  title = "Country Model: USA, India, UK, Canada and France",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```


## Duration Model 
```{r duration model, warning = FALSE}
# Simple regression of rating on film duration.
duration_model <- lm(
  rating ~ duration,
  data = dat_kaggle
)
summary(duration_model)
saveRDS(duration_model, file = "Tables/duration_model.rds")
# Authors' note: longer films have higher ratings.
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the duration model.
stargazer(
  duration_model,
  type = "html",
  header = FALSE,
  title = "Duration Model",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Horror Model
```{r horror model, warning = FALSE}
# 0/1 indicator columns for South Korea, Sweden and Japan: 1 where
# `country` equals the given name, 0 everywhere else (missing included).
dat_kaggle$SK     <- as.numeric(dat_kaggle$country %in% "South Korea")
dat_kaggle$Sweden <- as.numeric(dat_kaggle$country %in% "Sweden")
dat_kaggle$Japan  <- as.numeric(dat_kaggle$country %in% "Japan")

# One model per country: country indicator, Topic_6 and their interaction.
# Each fit is printed and then stored.
sk_horror_model <- lm(rating ~ SK * Topic_6, data = dat_kaggle)
summary(sk_horror_model)

sweden_horror_model <- lm(rating ~ Sweden * Topic_6, data = dat_kaggle)
summary(sweden_horror_model)

japan_horror_model <- lm(rating ~ Japan * Topic_6, data = dat_kaggle)
summary(japan_horror_model)

saveRDS(sk_horror_model, file = "Tables/sk_horror_model.rds")
saveRDS(sweden_horror_model, file = "Tables/sweden_horror_model.rds")
saveRDS(japan_horror_model, file = "Tables/japan_horror_model.rds")

# Authors' note: Japanese movies add higher positive contributions to the
# rating. That is not the case for Japanese horror movies though; the
# interaction effect is statistically insignificant, negative and
# substantively small.
```

```{r, results = 'asis', warning = FALSE}
# HTML table comparing the three horror interaction models side by side.
# The title lists all three countries shown in the columns; it previously
# named only South Korea and Sweden although Japan is in the table too.
stargazer(sk_horror_model, sweden_horror_model,
          japan_horror_model,
          type = "html",
          title = "Who Makes Better Horror Movies? South Korea vs. Sweden vs. Japan",
          # Leave out log-likelihood, residual SE, F statistic, adjusted R2.
          omit.stat = c("LL", "ser", "f", "adj.rsq"),
          dep.var.labels.include = FALSE,
          # One label per model, in the same order as the models above.
          column.labels = c("Rating in SK",
                            "Rating in Sweden", "Rating in Japan"),
          header = FALSE,
          single.row = TRUE,
          column.sep.width = "3pt")
```

## Romance Model
``` {r romance model, warning = FALSE}
# Country indicator, Topic_7 and their interaction, for France and the UK.
france_romance_model <- lm(rating ~ France * Topic_7, data = dat_kaggle)
summary(france_romance_model)

uk_romance_model <- lm(rating ~ UK * Topic_7, data = dat_kaggle)
summary(uk_romance_model)

# Store both fitted models.
saveRDS(france_romance_model, file = "Tables/france_romance_model.rds")
saveRDS(uk_romance_model, file = "Tables/uk_romance_model.rds")

# Authors' note: French romances have a statistically significant, negative
# effect on the rating. On the other hand, rating increases if a British
# movie turns out to be a romance.
```

```{r, results = 'asis', warning = FALSE}
# HTML table comparing the French and British romance models.
stargazer(
  france_romance_model, uk_romance_model,
  type = "html",
  header = FALSE,
  title = "Who Makes Better Romance Movies? France vs.UK",
  column.labels = c("Rating in France", "Rating in UK"),
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Crime Model 
```{r crime model, warning = FALSE}
# 0/1 indicator for Denmark: 1 where `country` equals "Denmark",
# 0 everywhere else (rows with a missing country included).
dat_kaggle$Denmark <- as.numeric(dat_kaggle$country %in% "Denmark")

# Country indicator, Topic_9 and their interaction. The `Sweden` column is
# not created here and must already exist on dat_kaggle.
denmark_crime_model <- lm(rating ~ Denmark * Topic_9, data = dat_kaggle)
sweden_crime_model <- lm(rating ~ Sweden * Topic_9, data = dat_kaggle)

# Store both fitted models.
# NOTE(review): the Swedish file name ends in "models" (plural), unlike the
# other saved models; confirm that whatever reads it uses the same name.
saveRDS(denmark_crime_model, file = "Tables/denmark_crime_model.rds")
saveRDS(sweden_crime_model, file = "Tables/sweden_crime_models.rds")

# Authors' note: Denmark crime movies fare better than Swedish crime movies.
```

```{r, results = 'asis', warning = FALSE}
# HTML table comparing the Danish and Swedish crime models.
stargazer(
  denmark_crime_model, sweden_crime_model,
  type = "html",
  header = FALSE,
  title = "Who Makes Better Police Movies? Denmark vs.Sweden",
  column.labels = c("Rating in Denmark", "Rating in Sweden"),
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## USA Model
```{r USA mode, warning = FALSE}
# USA indicator interacted with Topic_9 (crime) and with Topic_1 (battle).
usa_crime_model <- lm(rating ~ USA * Topic_9, data = dat_kaggle)
usa_battle_model <- lm(rating ~ USA * Topic_1, data = dat_kaggle)

# Print both fits, then store them.
summary(usa_crime_model)
summary(usa_battle_model)
saveRDS(usa_crime_model, file = "Tables/usa_crime_model.rds")
saveRDS(usa_battle_model, file = "Tables/usa_battle_model.rds")

# Authors' note: USA-made killer-cop-gang-serial films seem to do better
# than the average USA-made film.
```

```{r, results = 'asis', warning = FALSE}
# HTML table with the two USA interaction models side by side.
stargazer(
  usa_crime_model, usa_battle_model,
  type = "html",
  header = FALSE,
  title = "USA: Crime & Battle Movies",
  column.labels = c("Rating of Crime Movies", "Rating of Battle Movies"),
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Indian Duration Model
```{r Indian duration model, warning = FALSE }
# India indicator, duration and their interaction as predictors of rating.
india_len_model <- lm(
  rating ~ India * duration,
  data = dat_kaggle
)
summary(india_len_model)
saveRDS(india_len_model, file = "Tables/india_len_model.rds")
# Authors' note: We have derived above that Indian films tend to have higher
# ratings and that longer films have higher ratings, but when films are from
# India, a film's length has almost no predictive effect on its rating.
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the India x duration interaction model.
stargazer(
  india_len_model,
  type = "html",
  header = FALSE,
  title = "India: Are Longer Movies better than other Movies?",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Fixed Effect Model
```{r fe, warning = FALSE}
# Fixed effect model: the ten topic columns plus one dummy per country
# (LSDV, via as.factor(country)).
# The fit has to be active: the comparison table in the following chunk
# passes `unit_fe_kaggle` to stargazer(), which stops with "object not
# found" if the model is never created.
unit_fe_kaggle <- lm(rating ~ Topic_1 + Topic_2 + Topic_3 +
                       Topic_4 + Topic_5 + Topic_6 + Topic_7 + Topic_8 +
                       Topic_9 + Topic_10 + as.factor(country),
                     data = dat_kaggle)
saveRDS(unit_fe_kaggle, file = "Tables/unit_fe_kaggle.rds")
# Authors' note: Constant is quite moderate at 6.9, and the effects of each
# topic are all below 1 rating unit.
```

```{r, results = 'asis', warning = FALSE}
# HTML table placing the topics-only model next to the country
# fixed-effects model. Both `topics_only_model` and `unit_fe_kaggle` must
# have been created by earlier chunks.
stargazer(
  topics_only_model, unit_fe_kaggle,
  type = "html",
  header = FALSE,
  title = "Comparison between Topics and FE Topics Model",
  column.labels = c("Topics Model", " FE Topics Model"),
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

# Scraped data analyses 

## Data preparation 

``` {r prepare scraped data }
# Load the scraped data set.
dat_2020 <- readRDS("data/dat_2020.rds")

# Relabel columns 9 through 18 as Topic_1 ... Topic_10, in order.
# NOTE(review): this relies on the column positions of the stored file.
names(dat_2020)[9:18] <- paste0("Topic_", 1:10)

# Country as plain text with leading/trailing whitespace stripped, so that
# later equality comparisons against country names match.
dat_2020$Country <- str_trim(as.character(dat_2020$Country))
```

## Topics Model (2020)
``` {r scraped topics model, warning = FALSE }
# Linear model of Rating on all ten topic columns, scraped data.
topics_scraped_model <- lm(
  Rating ~ Topic_1 + Topic_2 + Topic_3 + Topic_4 + Topic_5 +
    Topic_6 + Topic_7 + Topic_8 + Topic_9 + Topic_10,
  data = dat_2020
)

# Print the fit, then store the model object.
summary(topics_scraped_model)
saveRDS(topics_scraped_model, file = "Tables/topics_scraped_model.rds")
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the 2020 topics model.
stargazer(
  topics_scraped_model,
  type = "html",
  header = FALSE,
  title = "Topic Model 2020: Which Topics have better ratings?",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Duration Model (2020)
```{r scraped duration model, warning = FALSE}
# Simple regression of Rating on Duration, scraped data.
duration_model_2020 <- lm(
  Rating ~ Duration,
  data = dat_2020
)
summary(duration_model_2020)
saveRDS(duration_model_2020, file = "Tables/duration_model_2020.rds")
# Authors' note: longer films have higher ratings.
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the 2020 duration model.
stargazer(
  duration_model_2020,
  type = "html",
  header = FALSE,
  title = "Duration Model for 2020",
  column.labels = "Rating",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## British Model (2020)
```{r}
# 0/1 indicator for the UK: 1 where `Country` equals "UK",
# 0 everywhere else (rows with a missing country included).
dat_2020$UK <- as.numeric(dat_2020$Country %in% "UK")

# UK indicator, Topic_7 and their interaction as predictors of Rating.
uk_romance_model_2020 <- lm(
  Rating ~ UK * Topic_7,
  data = dat_2020
)
summary(uk_romance_model_2020)
saveRDS(uk_romance_model_2020, file = "Tables/uk_romance_model_2020.rds")
```

```{r, results = 'asis', warning = FALSE}
# HTML regression table for the 2020 UK romance model.
stargazer(
  uk_romance_model_2020,
  type = "html",
  header = FALSE,
  title = "British Romance Films for 2020",
  column.labels = "Rating in UK",
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## USA Model (2020)
```{r scraped USA mode, warning = FALSE}

# 0/1 indicator for the USA: 1 where `Country` equals "USA",
# 0 everywhere else (rows with a missing country included).
dat_2020$USA <- as.numeric(dat_2020$Country %in% "USA")

# USA indicator interacted with Topic_9 (crime) and with Topic_1 (battle).
usa_crime_model_2020 <- lm(Rating ~ USA * Topic_9, data = dat_2020)
usa_battle_model_2020 <- lm(Rating ~ USA * Topic_1, data = dat_2020)

# Print both fits, then store them.
summary(usa_crime_model_2020)
summary(usa_battle_model_2020)
saveRDS(usa_crime_model_2020, file = "Tables/usa_crime_model_2020.rds")
saveRDS(usa_battle_model_2020, file = "Tables/usa_battle_model_2020.rds")

# Authors' note: USA-made killer-cop-gang-serial films seem to do better
# than the average USA-made film.
```

```{r, results = 'asis', warning = FALSE}
# HTML table with the two 2020 USA interaction models side by side.
stargazer(
  usa_crime_model_2020, usa_battle_model_2020,
  type = "html",
  header = FALSE,
  title = "USA: Crime & Battle Movies for 2020",
  column.labels = c("Rating of Crime Movies", "Rating of Battle Movies"),
  dep.var.labels.include = FALSE,
  single.row = TRUE,
  column.sep.width = "3pt",
  # Leave out log-likelihood, residual SE, F statistic and adjusted R2.
  omit.stat = c("LL", "ser", "f", "adj.rsq")
)
```

## Fixed Effect Model (2020)
```{r scraped fe, warning = FALSE}
# Fixed effect model on the scraped data: the ten topic columns plus one
# dummy per country (via as.factor(Country)).
unit_fe_2020 <- lm(
  Rating ~ Topic_1 + Topic_2 + Topic_3 + Topic_4 + Topic_5 +
    Topic_6 + Topic_7 + Topic_8 + Topic_9 + Topic_10 +
    as.factor(Country),
  data = dat_2020
)
saveRDS(unit_fe_2020, file = "Tables/unit_fe_2020.rds")
# Authors' note: Overall rating is high at 8.7, but the other estimates are
# statistically insignificant.
```

```{r, results = 'asis', warning = FALSE}
# HTML table placing the 2020 topics-only model next to the 2020 country
# fixed-effects model.
# The models are passed in the same order as `column.labels` (topics first,
# fixed effects second). They were previously passed the other way round,
# which put each label above the wrong column.
stargazer(topics_scraped_model, unit_fe_2020,
          type = "html",
          title = "Comparison between 2020 Topics Model and 2020 FE Topics Model",
          # Leave out log-likelihood, residual SE, F statistic, adjusted R2.
          omit.stat = c("LL", "ser", "f", "adj.rsq"),
          dep.var.labels.include = FALSE,
          column.labels = c("Topics Model", "FE Topics Model"),
          header = FALSE,
          single.row = TRUE,
          column.sep.width = "3pt")
```