Skip to content

Commit

Permalink
Major code refactoring (main&dataPlot)
Browse files Browse the repository at this point in the history
  • Loading branch information
Costopoulos committed Aug 22, 2022
1 parent d412ef9 commit 0793406
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 55 deletions.
106 changes: 59 additions & 47 deletions dataPlot.R
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
# ************************************************
# PRATICAL BUSINESS ANALYTICS 2021
# PRACTICAL BUSINESS ANALYTICS 2021
# COM3018
#
# Marilia Sinopli
# University of Surrey
# GUILDFORD
# Surrey GU2 7XH
#
# 13 NOVEMBER 2021
#
# UPDATE
# 1.00 13/11/2021 Initial plotHistograms() (Dominic Adams)
# 1.01 17/11/2021 Add plotAllMeanGraphs() (Dominic Adams)
# 1.02 18/11/2021 Add plotHistograms() (Dominic Adams)
# 1.03 20/11/2021 Add plotMeanHitFlopValues() (Dominic Adams)
# ************************************************

# ************************************************
# Global variables
# visualizeData() :
#
# Create all data plots
#
# INPUT: data frame - df - movies' data frame
# list - config - list of configurations
# bool - rangeBars - If true, plot mean values with standard deviation
# num - yLine - Release date (x axis) at which a vertical line is drawn; 0 draws no line
# bool - indivPlots - If true, plots graphs separately too
# ************************************************

MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime")
HIT_THRESHOLD <- 6.5
visualizeData <- function(df, config, rangeBars = FALSE, yLine = 0, indivPlots = FALSE) {
  # Entry point for data exploration: draws the mean-value line graphs, the
  # per-field histograms and the table of mean values, in that order.
  #
  # df         - movies' data frame
  # config     - list of configurations, passed on to visualizeMeanGraphs()
  # rangeBars  - passed on to visualizeMeanGraphs()
  # yLine      - passed on to visualizeMeanGraphs()
  # indivPlots - passed on to both visualizeMeanGraphs() and
  #              visualizeHistograms(), so that a request for individual
  #              plots applies to every group of graphs and not only the first
  #
  # The value of the last call, visualizeMeanSuccessFailureValues(df), is
  # what this function evaluates to.

  # Mean values across the years
  visualizeMeanGraphs(df, config, rangeBars, yLine, indivPlots)

  # Distribution of field values for successful and failed movies
  visualizeHistograms(df, indivPlots = indivPlots)

  # Table with mean field values for successful and failed movies
  visualizeMeanSuccessFailureValues(df)
}

# ************************************************
# plotAllMeanGraphs() :
# visualizeMeanGraphs() :
#
# Plot line graphs to see how the mean values change over time
# Plot line graphs to show the change of mean values over time
#
# INPUT: data frame - df - movies' data frame
# INPUT: data frame - df - movies' data frame
# list - config - list of configurations
# bool - rangeBars - If true, plot mean values with standard deviation
# num - yLine - Release date (x axis) at which a vertical line is drawn; 0 draws no line
# bool - indivPlots - If true, plots graphs separately too
# ************************************************
plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE){
visualizeMeanGraphs<-function(df, config, rangeBars = FALSE, yLine = 0, indivPlots = FALSE){
p <- list()
for(i in 1:length(MEAN_WORTHY_FIELDS)){
p[[i]] <- ggplot(data=df, aes_string(y=MEAN_WORTHY_FIELDS[i], x="release_date", group=1)) +
ggtitle(paste("Mean values for", MEAN_WORTHY_FIELDS[i])) +
for(i in 1:length(config$MEAN_WORTHY_FIELDS)){
p[[i]] <- ggplot(data=df, aes_string(y=config$MEAN_WORTHY_FIELDS[i], x="release_date", group=1)) +
ggtitle(paste("Mean values for", config$MEAN_WORTHY_FIELDS[i])) +
scale_x_continuous(name="Release date") +
theme(
plot.title = element_text(color="black", size=11, face="bold"),
Expand All @@ -44,20 +45,21 @@ plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE
axis.title.y = element_text(color="#993333", size=11, face="bold")
)

# Can add range bars showing mean value +/- 1 standard deviation as max/min values
# Range bar showcasing max value as mean+1 standard deviation and min as
# mean-1 standard deviation
if (rangeBars == TRUE) {
p[[i]] <- p[[i]] + stat_summary(aes_string(y = MEAN_WORTHY_FIELDS[i],group=1), fun=mean, fun.min = function(x) mean(x) - sd(x),
p[[i]] <- p[[i]] + stat_summary(aes_string(y = config$MEAN_WORTHY_FIELDS[i],group=1), fun=mean, fun.min = function(x) mean(x) - sd(x),
fun.max = function(x) mean(x) + sd(x), colour="black", geom = "pointrange",group=1)
} else {
p[[i]] <- p[[i]] + stat_summary(aes_string(y = MEAN_WORTHY_FIELDS[i],group=1), fun=mean, colour="black", geom = "line",group=1)
p[[i]] <- p[[i]] + stat_summary(aes_string(y = config$MEAN_WORTHY_FIELDS[i],group=1), fun=mean, colour="black", geom = "line",group=1)
}

# Can add a vertical line on to the graph at a specific year
# Draw a vertical line at a specific point in time on the x axis
if (yLine != 0) {
p[[i]] <- p[[i]] + geom_vline(xintercept = yLine, color = "red")
}

# Can plot each graph to its own plot in addition to the grouped plot display
# Plot graphs individually
if (indivPlots == TRUE) {
plot(p[[i]])
}
Expand All @@ -70,13 +72,16 @@ plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE
#
# Plots an individual histogram for the input field and bin width
#
# INPUT: data frame - df - data frame of movies
# INPUT: data frame - df - data frame of movies
# various - field - field to plot histogram for
# num - binWidth - width of the bars
# bool - indivPlots - If true, plots graphs separately too
#
# OUTPUT: ggplot histogram - histogram object which can be printed
# ************************************************
plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
p<-ggplot(df, aes_string(x=field, fill="vote_average", color="vote_average")) +
ggtitle(paste(field, "histogram")) +
ggtitle(paste(field, "histogram")) + #(xlim(0,NA)) +
geom_histogram(binwidth = binWidth,position="identity",aes_string(fill="vote_average"),alpha=0.5)+
scale_color_manual(values=c("#999999", "#E69F00"))+
scale_fill_manual(values=c("#999999", "#E69F00"))+
Expand All @@ -88,6 +93,7 @@ plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
axis.title.y = element_text(color="#993333", size=11, face="bold")
)

# Plot graphs individually
if (indivPlots == TRUE) {
plot(p)
}
Expand All @@ -96,13 +102,13 @@ plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
}

# ************************************************
# plotAllHistograms() :
# visualizeHistograms() :
#
# Plot histograms to see the frequency distribution of each field in the dataset
#
# INPUT: data frame - df - movies' dataset
# bool - indivPlots - If true, plots graphs separately too
# ************************************************
plotHistograms<-function(df, indivPlots = FALSE){
visualizeHistograms<-function(df, indivPlots = FALSE){

# Plot histograms for each field
p1 <- plotHistogram(df=df, field="vote_count", binWidth=1000, indivPlots=indivPlots)
Expand All @@ -111,37 +117,43 @@ plotHistograms<-function(df, indivPlots = FALSE){
p4 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
p5 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)

# # Plot histograms for each field
# p1 <- plotHistogram(df=df, field="vote_count")
# p2 <- plotHistogram(df=df, field="runtime")
# p3 <- plotHistogram(df=df, field="revenue")
# p4 <- plotHistogram(df=df, field="budget")
# p5 <- plotHistogram(df=df, field="popularity")

grid.arrange(p1, p2, p3, p4, p5)
}

# ************************************************
# plotMeanHitFlopValues() :
# visualizeMeanSuccessFailureValues() :
#
# Plot a table showing the mean field values for hit and flop movies
# Table with mean field values for successful and failed movies
#
# INPUT: data frame - df - movies' data frame
# ************************************************
plotMeanHitFlopValues<-function(df){

visualizeMeanSuccessFailureValues<-function(df){
# Remove fields that we do not need the mean values for
TO_DROP <- c("release_date")
df <- df[, !(names(df) %in% TO_DROP)]

# Create a data frame for the mean hit values of all numeric columns in df
hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
mean_hits <- colMeans(hits[sapply(hits, is.numeric)])
# Data frame with the mean success values of all num columns in df
success <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
mean_success <- colMeans(success[sapply(success, is.numeric)])

# Create a data frame for the mean flop values of all numeric columns in df
flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
mean_flops <- colMeans(flops[sapply(flops, is.numeric)])
# Data frame with the mean failure values of all num columns in df
failure <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
mean_failure <- colMeans(failure[sapply(failure, is.numeric)])

combined_means <- data.frame(cbind(mean_flops, mean_hits))
combined_means <- data.frame(cbind(mean_failure, mean_success))
# Round all values to two decimal places
combined_means <- round(combined_means, 2)

# Add labels to the columns
names(combined_means)[1]<-"Mean flop value"
names(combined_means)[2]<-"Mean hit value"
names(combined_means)[1]<-"Mean failure value"
names(combined_means)[2]<-"Mean success value"
# Delete the vote_average row
combined_means<-combined_means[-5,]
combined_means <- cbind("Field" = rownames(combined_means), as.data.frame(combined_means))
Expand Down
15 changes: 7 additions & 8 deletions main.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ setConfig<-function() {
# this threshold is considered a
# success

MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime") # Fields to
# get the mean
# values from

# Initialize empty list to keep key-value pairs
config <- list()

Expand All @@ -86,6 +90,7 @@ setConfig<-function() {
config[['NODE_DEPTH']] <- NODE_DEPTH
config[['TREE_NUMBER']] <- TREE_NUMBER
config[['BLOCKBUSTER_THRESHOLD']]<- BLOCKBUSTER_THRESHOLD
config[['MEAN_WORTHY_FIELDS']] <- MEAN_WORTHY_FIELDS

return(config)
}
Expand Down Expand Up @@ -135,7 +140,6 @@ runModels<-function(dataset, normalized_dataset){
#
# INPUT: None
# OUTPUT :None

# ************************************************
main<-function(){
# Get config
Expand All @@ -145,13 +149,8 @@ main<-function(){
movies <- initialPreprocessing(config)
cat("The number of rows in the 'movies' object is now", nrow(movies),"\n")

# Data exploration
#
# If plot(s) are not visible in the RStudio Plots tab, comment out everything
# in main() after plotMeanHitFlopValues(movies) is called
plotAllMeanGraphs(movies, rangeBars = FALSE, yLine=0, indivPlots = FALSE) # Mean values across the years
plotHistograms(movies, indivPlots = FALSE) # Distribution of field values for hits and flops
plotMeanHitFlopValues(movies) # Table showing mean field values for hits and flops
# Visualize data
visualizeData(movies, config)

# Dataset after entire dataset has been preprocessed
datasets <- preprocessing(movies)
Expand Down

0 comments on commit 0793406

Please sign in to comment.