Update preprocessing

Change the "vote_average" column to a discrete 0 or 1 value, according to whether the movie is a blockbuster (initial vote_average >= 6.5, final vote_average = 1) or a failure (initial vote_average < 6.5, final vote_average = 0)
Costopoulos · Aug 19, 2022 · 7e1567c · 7e1567c
1 parent 716d4c8
commit 7e1567c
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 24 deletions.
diff --git a/dataPlot.R b/dataPlot.R
@@ -20,8 +20,7 @@
 # Global variables
 # ************************************************
 
-MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime",
-                        "vote_average")
+MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime")
 HIT_THRESHOLD <- 6.5
 
 # ************************************************
@@ -107,13 +106,12 @@ plotHistograms<-function(df, indivPlots = FALSE){
 
   # Plot histograms for each field
   p1 <- plotHistogram(df=df, field="vote_count", binWidth=1000, indivPlots=indivPlots)
-  p2 <- plotHistogram(df=df, field="vote_average", binWidth=1, indivPlots=indivPlots)
-  p3 <- plotHistogram(df=df, field="runtime", binWidth=100, indivPlots=indivPlots)
-  p4 <- plotHistogram(df=df, field="revenue", binWidth=100000000, indivPlots=indivPlots)
-  p5 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
-  p6 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)
+  p2 <- plotHistogram(df=df, field="runtime", binWidth=100, indivPlots=indivPlots)
+  p3 <- plotHistogram(df=df, field="revenue", binWidth=100000000, indivPlots=indivPlots)
+  p4 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
+  p5 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)
 
-  grid.arrange(p1, p2, p3, p4, p5, p6)
+  grid.arrange(p1, p2, p3, p4, p5)
 }
 
 # ************************************************
@@ -130,11 +128,11 @@ plotMeanHitFlopValues<-function(df){
   df <- df[, !(names(df) %in% TO_DROP)]
 
   # Create a data frame for the mean hit values of all numeric columns in df
-  hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average>=HIT_THRESHOLD,]))
+  hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
   mean_hits <- colMeans(hits[sapply(hits, is.numeric)])
 
   # Create a data frame for the mean flop values of all numeric columns in df
-  flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average<HIT_THRESHOLD,]))
+  flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
   mean_flops <- colMeans(flops[sapply(flops, is.numeric)])
 
   combined_means <- data.frame(cbind(mean_flops, mean_hits))
@@ -144,10 +142,12 @@ plotMeanHitFlopValues<-function(df){
   # Add labels to the columns
   names(combined_means)[1]<-"Mean flop value"
   names(combined_means)[2]<-"Mean hit value"
+  # Delete the vote_average row
+  combined_means<-combined_means[-5,]
   combined_means <- cbind("Field" = rownames(combined_means), as.data.frame(combined_means))
   rownames(combined_means) <- c()
 
-  t<-formattable(combined_means, align=c("l",rep("r", NCOL(combined_means)-1)))
+  t<-formattable(combined_means, align=c("l",rep("r", NCOL(combined_means))))
   print(t)
 }
 
diff --git a/helperMethods.R b/helperMethods.R
@@ -110,9 +110,9 @@ metricsToRealWorld<-function(dataset,measures,natural){
 
   #target is encoded as: 1 = Flop, 0 = Hit
   #Calculate class balance ratio, rm = flop / hit
-  ClassHit<-length(which(dataset[,positionClassOutput]>=FACTORED_HIT_THRESHOLD))
-  ClassFlop<-length(which(dataset[,positionClassOutput]<FACTORED_HIT_THRESHOLD))
-  browser()
+  ClassHit<-length(which(dataset[,positionClassOutput]==1))
+  ClassFlop<-length(which(dataset[,positionClassOutput]==0))
+
   classBalance<-ClassFlop/ClassHit
   print(paste("Class balance, flop:hit=",round(classBalance,digits=2)))
 

diff --git a/main.R b/main.R
@@ -36,10 +36,12 @@ PROBLEMATIC_FIELDS <- c("belongs_to_collection","homepage","tagline")
 
 UNUSED_FIELDS     <- c("homepage","id","imdb_id","original_title",
                        "belongs_to_collection","overview", "poster_path",
-                       "production_country","release_date", "spoken_languages",
+                       "production_countries","release_date", "spoken_languages",
                        "status","tagline","title","video")
-SYMBOLIC_FIELDS   <- c("adult","genres","original_language","production_companies")
-ORDINAL_FIELDS    <- c("popularity","runtime","vote_average")
+SYMBOLIC_FIELDS   <- c("adult","genres","original_language",
+                       "production_companies","vote_average")
+ORDINAL_FIELDS    <- c("popularity","runtime") # vote_average was ordinal, but
+                                               # I changed it to ease the modelling
 DISCREET_FIELDS   <- c("budget","revenue","vote_count")
 
 TYPE_DISCREET     <- "DISCREET"           # field is discreet (numeric)
@@ -104,14 +106,14 @@ runModels<-function(dataset, normalized_dataset){
   allResults<-realWorldMetrics(dataset, RFmeasures, allResults, "RF")
 
   # # Uncomment if we want to experiment with more DTs:
-  # # allResults <- runDTModels(dataset, allResults)
-  # 
+  # allResults <- runDTModels(dataset, allResults)
+
   # # ************************************************
   # # Modelling: KNN
   # # ************************************************
   # 
   # # Use stratified k-fold cross-validation with the KNN algorithm
-  # KNNmeasures<-runExperiment(dataset = normalised_dataset,FUN = knnModel)
+  # KNNmeasures<-runExperiment(dataset = normalized_dataset,FUN = knnModel)
   # allResults<-cbind(allResults,data.frame(KNN=unlist(KNNmeasures)))
   # allResults<-realWorldMetrics(dataset, KNNmeasures, allResults, "KNN")
 
@@ -151,9 +153,9 @@ main<-function(){
 
   # Run experiments on the models
   allResults <- runModels(dataset_normalized, normalized_dataset=dataset_normalized)
-  # 
-  # # Perform evaluation
-  # runEvaluation(allResults)
+
+  # Perform evaluation
+  runEvaluation(allResults)
 }
 
 # ************************************************
@@ -177,7 +179,7 @@ pacman::p_load(char=LIBRARIES,install=TRUE,character.only=TRUE)
 source("preprocessing.R")
 source("dataPlot.R")
 source("stratifiedKFold.R")
-# source("KNNFunctions.R")
+source("knn.R")
 source("forest.R")
 source("helperMethods.R")
 source("evaluation.R")

diff --git a/preprocessing.R b/preprocessing.R
@@ -90,6 +90,31 @@ convertTypes<-function(df) {
   return(df)
 }
 
+# *******************************************************
+# hitFlop() :
+#
+# Binarises the "vote_average" column: 1 (hit) when the
+# score is >= HIT_THRESHOLD (global), else 0 (flop).
+# Vectorised, so a data frame with zero rows simply
+# keeps its shape and a missing score stays NA instead
+# of aborting the run.
+#
+# INPUT: data frame - df - dataframe from the dataset,
+#                          must contain "vote_average"
+#
+# OUTPUT : data frame - df with vote_average as 1/0/NA
+# *******************************************************
+hitFlop<-function(df) {
+  stopifnot("vote_average" %in% names(df))
+
+  # Logical mask; a missing score yields NA here
+  isHit <- df[["vote_average"]] >= HIT_THRESHOLD
+
+  # TRUE -> 1, FALSE -> 0, NA -> NA
+  df[["vote_average"]] <- as.numeric(isHit)
+  return(df)
+}
+
 # ************************************************
 # initialPreprocessing() :
 #
@@ -115,6 +140,9 @@ initialPreprocessing<-function(datasetFile){
   # Update datatypes
   movies <- convertTypes(movies)
 
+  # Movie is a hit if score is >= 6.5
+  movies <- hitFlop(movies)
+
   # Remove any duplicate movies
   movies <- movies[!duplicated(movies[c("original_title","id","imdb_id")]),]