Skip to content

Commit

Permalink
Update preprocessing
Browse files Browse the repository at this point in the history
Change "vote_average" column to be discrete 0 or 1, according to whether the movie will be a blockbuster (initial vote_average >= 6.5, final = 1) or a failure (final vote_average = 0)
  • Loading branch information
Costopoulos committed Aug 19, 2022
1 parent 716d4c8 commit 7e1567c
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 24 deletions.
22 changes: 11 additions & 11 deletions dataPlot.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
# Global variables
# ************************************************

MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime",
"vote_average")
MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime")
HIT_THRESHOLD <- 6.5

# ************************************************
Expand Down Expand Up @@ -107,13 +106,12 @@ plotHistograms<-function(df, indivPlots = FALSE){

# Plot histograms for each field
p1 <- plotHistogram(df=df, field="vote_count", binWidth=1000, indivPlots=indivPlots)
p2 <- plotHistogram(df=df, field="vote_average", binWidth=1, indivPlots=indivPlots)
p3 <- plotHistogram(df=df, field="runtime", binWidth=100, indivPlots=indivPlots)
p4 <- plotHistogram(df=df, field="revenue", binWidth=100000000, indivPlots=indivPlots)
p5 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
p6 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)
p2 <- plotHistogram(df=df, field="runtime", binWidth=100, indivPlots=indivPlots)
p3 <- plotHistogram(df=df, field="revenue", binWidth=100000000, indivPlots=indivPlots)
p4 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
p5 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)

grid.arrange(p1, p2, p3, p4, p5, p6)
grid.arrange(p1, p2, p3, p4, p5)
}

# ************************************************
Expand All @@ -130,11 +128,11 @@ plotMeanHitFlopValues<-function(df){
df <- df[, !(names(df) %in% TO_DROP)]

# Create a data frame for the mean hit values of all numeric columns in df
hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average>=HIT_THRESHOLD,]))
hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
mean_hits <- colMeans(hits[sapply(hits, is.numeric)])

# Create a data frame for the mean flop values of all numeric columns in df
flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average<HIT_THRESHOLD,]))
flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
mean_flops <- colMeans(flops[sapply(flops, is.numeric)])

combined_means <- data.frame(cbind(mean_flops, mean_hits))
Expand All @@ -144,10 +142,12 @@ plotMeanHitFlopValues<-function(df){
# Add labels to the columns
names(combined_means)[1]<-"Mean flop value"
names(combined_means)[2]<-"Mean hit value"
# Delete the vote_average row
combined_means<-combined_means[-5,]
combined_means <- cbind("Field" = rownames(combined_means), as.data.frame(combined_means))
rownames(combined_means) <- c()

t<-formattable(combined_means, align=c("l",rep("r", NCOL(combined_means)-1)))
t<-formattable(combined_means, align=c("l",rep("r", NCOL(combined_means))))
print(t)
}

6 changes: 3 additions & 3 deletions helperMethods.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ metricsToRealWorld<-function(dataset,measures,natural){

#target is encoded as: 1 = Hit, 0 = Flop
#Calculate class balance ratio, rm = flop / hit
ClassHit<-length(which(dataset[,positionClassOutput]>=FACTORED_HIT_THRESHOLD))
ClassFlop<-length(which(dataset[,positionClassOutput]<FACTORED_HIT_THRESHOLD))
browser()
ClassHit<-length(which(dataset[,positionClassOutput]==1))
ClassFlop<-length(which(dataset[,positionClassOutput]==0))

classBalance<-ClassFlop/ClassHit
print(paste("Class balance, flop:hit=",round(classBalance,digits=2)))

Expand Down
22 changes: 12 additions & 10 deletions main.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ PROBLEMATIC_FIELDS <- c("belongs_to_collection","homepage","tagline")

UNUSED_FIELDS <- c("homepage","id","imdb_id","original_title",
"belongs_to_collection","overview", "poster_path",
"production_country","release_date", "spoken_languages",
"production_countries","release_date", "spoken_languages",
"status","tagline","title","video")
SYMBOLIC_FIELDS <- c("adult","genres","original_language","production_companies")
ORDINAL_FIELDS <- c("popularity","runtime","vote_average")
SYMBOLIC_FIELDS <- c("adult","genres","original_language",
"production_companies","vote_average")
ORDINAL_FIELDS <- c("popularity","runtime") # vote_average was ordinal, but
# I changed it to ease the modelling
DISCREET_FIELDS <- c("budget","revenue","vote_count")

TYPE_DISCREET <- "DISCREET" # field is discreet (numeric)
Expand Down Expand Up @@ -104,14 +106,14 @@ runModels<-function(dataset, normalized_dataset){
allResults<-realWorldMetrics(dataset, RFmeasures, allResults, "RF")

# # Uncomment if we want to experiment with more DTs:
# # allResults <- runDTModels(dataset, allResults)
#
# allResults <- runDTModels(dataset, allResults)

# # ************************************************
# # Modelling: KNN
# # ************************************************
#
# # Use stratified k-fold cross-validation with the KNN algorithm
# KNNmeasures<-runExperiment(dataset = normalised_dataset,FUN = knnModel)
# KNNmeasures<-runExperiment(dataset = normalized_dataset,FUN = knnModel)
# allResults<-cbind(allResults,data.frame(KNN=unlist(KNNmeasures)))
# allResults<-realWorldMetrics(dataset, KNNmeasures, allResults, "KNN")

Expand Down Expand Up @@ -151,9 +153,9 @@ main<-function(){

# Run experiments on the models
allResults <- runModels(dataset_normalized, normalized_dataset=dataset_normalized)
#
# # Perform evaluation
# runEvaluation(allResults)

# Perform evaluation
runEvaluation(allResults)
}

# ************************************************
Expand All @@ -177,7 +179,7 @@ pacman::p_load(char=LIBRARIES,install=TRUE,character.only=TRUE)
source("preprocessing.R")
source("dataPlot.R")
source("stratifiedKFold.R")
# source("KNNFunctions.R")
source("knn.R")
source("forest.R")
source("helperMethods.R")
source("evaluation.R")
Expand Down
28 changes: 28 additions & 0 deletions preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,31 @@ convertTypes<-function(df) {
return(df)
}

# *******************************************************
# hitFlop() :
#
# Binarises the "vote_average" column: a score greater
# than or equal to HIT_THRESHOLD becomes 1 (hit), any
# lower score becomes 0 (flop). The whole column is
# converted with one vectorised comparison, so a data
# frame with zero rows is handled without error.
#
# INPUT: data frame - df - dataframe from the dataset;
#                          must contain a "vote_average"
#                          column (expected to be numeric)
#
# OUTPUT : data frame - df with updated vote_average col
#                       (numeric 0/1; NA scores stay NA)
# *******************************************************
hitFlop <- function(df) {
  # Fail early with a clear message instead of a confusing
  # replacement-length error further down
  stopifnot("vote_average" %in% names(df))

  # HIT_THRESHOLD is a global constant defined outside this function
  is_hit <- df[["vote_average"]] >= HIT_THRESHOLD

  # as.numeric() maps TRUE/FALSE to 1/0 and keeps the column numeric
  df[["vote_average"]] <- as.numeric(is_hit)
  return(df)
}

# ************************************************
# initialPreprocessing() :
#
Expand All @@ -115,6 +140,9 @@ initialPreprocessing<-function(datasetFile){
# Update datatypes
movies <- convertTypes(movies)

# Movie is a hit if score is >= 6.5
movies <- hitFlop(movies)

# Remove any duplicate movies
movies <- movies[!duplicated(movies[c("original_title","id","imdb_id")]),]

Expand Down

0 comments on commit 7e1567c

Please sign in to comment.