Major code refactoring (main&dataPlot)

Costopoulos · Aug 22, 2022 · 0793406 · 0793406
1 parent d412ef9
commit 0793406
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 55 deletions.
diff --git a/dataPlot.R b/dataPlot.R
@@ -1,40 +1,41 @@
 # ************************************************
-#  PRATICAL BUSINESS ANALYTICS 2021
+#  PRACTICAL BUSINESS ANALYTICS 2021
 #  COM3018
-#
-# Marilia Sinopli
-# University of Surrey
-# GUILDFORD
-# Surrey GU2 7XH
-#
-# 13 NOVEMBER 2021
-#
-# UPDATE
-# 1.00      13/11/2021    Initial plotHistograms()   (Dominic Adams)
-# 1.01      17/11/2021    Add plotAllMeanGraphs()   (Dominic Adams)
-# 1.02      18/11/2021    Add plotHistograms()   (Dominic Adams)
-# 1.02      20/11/2021    Add plotMeanHitFlopValues()   (Dominic Adams)
 # ************************************************
 
 # ************************************************
-# Global variables
+# visualizeData() :
+#
+# Create all data plots
+#
+# INPUT: data frame - df - movies' data frame
+#        list       - config     - list of configurations
+#        bool       - rangeBars  - If true, plot mean values with standard deviation
+#        num        - yLine      - x-axis value (release date) at which to draw a vertical line; 0 draws none
+#        bool       - indivPlots - If true, plots graphs separately too
 # ************************************************
-
-MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime")
-HIT_THRESHOLD <- 6.5
+visualizeData<-function(df, config, rangeBars = FALSE, yLine = 0, indivPlots = FALSE) {
+  visualizeMeanGraphs(df, config, rangeBars, yLine, indivPlots) # Mean values across the years
+  visualizeHistograms(df) # Distribution of field values for successful and fail movies
+  visualizeMeanSuccessFailureValues(df) # Table with mean field values for successful and fail movies
+}
 
 # ************************************************
-# plotAllMeanGraphs() :
+# visualizeMeanGraphs() :
 #
-# Plot line graphs to see how the mean values change over time
+# Plot line graphs to show the change of mean values over time
 #
-# INPUT: data frame - df - movies' data frame
+# INPUT: data frame - df         - movies' data frame
+#        list       - config     - list of configurations
+#        bool       - rangeBars  - If true, plot mean values with standard deviation
+#        num        - yLine      - x-axis value (release date) at which to draw a vertical line; 0 draws none
+#        bool       - indivPlots - If true, plots graphs separately too
 # ************************************************
-plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE){
+visualizeMeanGraphs<-function(df, config, rangeBars = FALSE, yLine = 0, indivPlots = FALSE){
   p <- list()
-  for(i in 1:length(MEAN_WORTHY_FIELDS)){
-    p[[i]] <- ggplot(data=df, aes_string(y=MEAN_WORTHY_FIELDS[i], x="release_date", group=1)) +
-      ggtitle(paste("Mean values for", MEAN_WORTHY_FIELDS[i])) +
+  for(i in 1:length(config$MEAN_WORTHY_FIELDS)){
+    p[[i]] <- ggplot(data=df, aes_string(y=config$MEAN_WORTHY_FIELDS[i], x="release_date", group=1)) +
+      ggtitle(paste("Mean values for", config$MEAN_WORTHY_FIELDS[i])) +
       scale_x_continuous(name="Release date") +
       theme(
         plot.title = element_text(color="black", size=11, face="bold"),
@@ -44,20 +45,21 @@ plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE
         axis.title.y = element_text(color="#993333", size=11, face="bold")
       )
 
-    # Can add range bars showing mean value +/- 1 standard deviation as max/min values
+    # Range bar showcasing max value as mean+1 standard deviation and min as 
+    # mean-1 standard deviation
     if (rangeBars == TRUE) {
-      p[[i]] <- p[[i]] + stat_summary(aes_string(y = MEAN_WORTHY_FIELDS[i],group=1), fun=mean, fun.min = function(x) mean(x) - sd(x),
+      p[[i]] <- p[[i]] + stat_summary(aes_string(y = config$MEAN_WORTHY_FIELDS[i],group=1), fun=mean, fun.min = function(x) mean(x) - sd(x),
                                       fun.max = function(x) mean(x) + sd(x), colour="black", geom = "pointrange",group=1)
     } else {
-      p[[i]] <- p[[i]] + stat_summary(aes_string(y = MEAN_WORTHY_FIELDS[i],group=1), fun=mean, colour="black", geom = "line",group=1)
+      p[[i]] <- p[[i]] + stat_summary(aes_string(y = config$MEAN_WORTHY_FIELDS[i],group=1), fun=mean, colour="black", geom = "line",group=1)
     }
 
-    # Can add a vertical line on to the graph at a specific year
+    # Draw a red vertical line at x = yLine (a release date); skipped when yLine is 0
     if (yLine != 0) {
       p[[i]] <- p[[i]] + geom_vline(xintercept = yLine, color = "red")
     }
 
-    # Can plot each graph to its own plot in addition to the grouped plot display
+    # Plot graphs individually
     if (indivPlots == TRUE) {
       plot(p[[i]])
     }
@@ -70,13 +72,16 @@ plotAllMeanGraphs<-function(df, rangeBars = FALSE, yLine = 0, indivPlots = FALSE
 #
 # Plots an individual histogram for the input field and bin width
 #
-# INPUT: data frame - df - data frame of movies
+# INPUT: data frame - df         - data frame of movies
+#        various    - field      - field to plot histogram for
+#        num        - binWidth   - width of the bars
+#        bool       - indivPlots - If true, plots graphs separately too
 #
 # OUTPUT: ggplot histogram - histogram object which can be printed
 # ************************************************
 plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
   p<-ggplot(df, aes_string(x=field, fill="vote_average", color="vote_average")) +
-    ggtitle(paste(field, "histogram")) +
+    ggtitle(paste(field, "histogram")) + #(xlim(0,NA)) +
     geom_histogram(binwidth = binWidth,position="identity",aes_string(fill="vote_average"),alpha=0.5)+
     scale_color_manual(values=c("#999999", "#E69F00"))+
     scale_fill_manual(values=c("#999999", "#E69F00"))+
@@ -88,6 +93,7 @@ plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
       axis.title.y = element_text(color="#993333", size=11, face="bold")
     )
 
+  # Plot graphs individually
   if (indivPlots == TRUE) {
     plot(p)
   }
@@ -96,13 +102,13 @@ plotHistogram<-function(df, field, binWidth = 30, indivPlots = FALSE){
 }
 
 # ************************************************
-# plotAllHistograms() :
+# visualizeHistograms() :
 #
 # Plot histograms to see the frequency distribution of each field in the dataset
 #
 # INPUT: data frame - df - movies' dataset
 # ************************************************
-plotHistograms<-function(df, indivPlots = FALSE){
+visualizeHistograms<-function(df, indivPlots = FALSE){
 
   # Plot histograms for each field
   p1 <- plotHistogram(df=df, field="vote_count", binWidth=1000, indivPlots=indivPlots)
@@ -111,37 +117,43 @@ plotHistograms<-function(df, indivPlots = FALSE){
   p4 <- plotHistogram(df=df, field="budget", binWidth=20000000, indivPlots=indivPlots)
   p5 <- plotHistogram(df=df, field="popularity", binWidth=30, indivPlots=indivPlots)
 
+  # # Plot histograms for each field
+  # p1 <- plotHistogram(df=df, field="vote_count")
+  # p2 <- plotHistogram(df=df, field="runtime")
+  # p3 <- plotHistogram(df=df, field="revenue")
+  # p4 <- plotHistogram(df=df, field="budget")
+  # p5 <- plotHistogram(df=df, field="popularity")
+
   grid.arrange(p1, p2, p3, p4, p5)
 }
 
 # ************************************************
-# plotMeanHitFlopValues() :
+# visualizeMeanSuccessFailureValues() :
 #
-# Plot a table showing the mean field values for hit and flop movies
+# Build a table of the mean field values for successful and failed movies
 #
 # INPUT: data frame - df - movies' data frame
 # ************************************************
-plotMeanHitFlopValues<-function(df){
-
+visualizeMeanSuccessFailureValues<-function(df){
   # Remove fields that we do not need the mean values for
   TO_DROP <- c("release_date")
   df <- df[, !(names(df) %in% TO_DROP)]
 
-  # Create a data frame for the mean hit values of all numeric columns in df
-  hits <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
-  mean_hits <- colMeans(hits[sapply(hits, is.numeric)])
+  # Data frame with the mean success values of all num columns in df
+  success <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="1",]))
+  mean_success <- colMeans(success[sapply(success, is.numeric)])
 
-  # Create a data frame for the mean flop values of all numeric columns in df
-  flops <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
-  mean_flops <- colMeans(flops[sapply(flops, is.numeric)])
+  # Data frame with the mean failure values of all num columns in df
+  failure <- as.data.frame(lapply(list(df), function(x)x[x$vote_average=="0",]))
+  mean_failure <- colMeans(failure[sapply(failure, is.numeric)])
 
-  combined_means <- data.frame(cbind(mean_flops, mean_hits))
+  combined_means <- data.frame(cbind(mean_failure, mean_success))
   # Round all values to two decimal places
   combined_means <- round(combined_means, 2)
 
   # Add labels to the columns
-  names(combined_means)[1]<-"Mean flop value"
-  names(combined_means)[2]<-"Mean hit value"
+  names(combined_means)[1]<-"Mean failure value"
+  names(combined_means)[2]<-"Mean success value"
   # Delete the vote_average row
   combined_means<-combined_means[-5,]
   combined_means <- cbind("Field" = rownames(combined_means), as.data.frame(combined_means))

diff --git a/main.R b/main.R
@@ -65,6 +65,10 @@ setConfig<-function() {
                                               # this threshold is considered a 
                                               # success
 
+  MEAN_WORTHY_FIELDS <- c("popularity","budget","revenue","runtime") # Fields to
+                                                                     # get the mean
+                                                                     # values from
+
   # Initialize empty list to keep key-value pairs
   config <- list()
 
@@ -86,6 +90,7 @@ setConfig<-function() {
   config[['NODE_DEPTH']]           <- NODE_DEPTH
   config[['TREE_NUMBER']]          <- TREE_NUMBER
   config[['BLOCKBUSTER_THRESHOLD']]<- BLOCKBUSTER_THRESHOLD
+  config[['MEAN_WORTHY_FIELDS']]   <- MEAN_WORTHY_FIELDS
 
   return(config)
 }
@@ -135,7 +140,6 @@ runModels<-function(dataset, normalized_dataset){
 #
 # INPUT: None
 # OUTPUT :None
-
 # ************************************************
 main<-function(){
   # Get config
@@ -145,13 +149,8 @@ main<-function(){
   movies <- initialPreprocessing(config)
   cat("The number of rows in the 'movies' object is now", nrow(movies),"\n")
 
-  # Data exploration
-  #
-  # If plot(s) are not visible in the RStudio Plots tab, comment out everything 
-  # in main() after plotMeanHitFlopValues(movies) is called
-  plotAllMeanGraphs(movies, rangeBars = FALSE, yLine=0, indivPlots = FALSE) # Mean values across the years
-  plotHistograms(movies, indivPlots = FALSE) # Distribution of field values for hits and flops
-  plotMeanHitFlopValues(movies) # Table showing mean field values for hits and flops
+  # Visualize data
+  visualizeData(movies, config)
 
   # Dataset after entire dataset has been preprocessed
   datasets <- preprocessing(movies)