Predicting the Success of a Film

In [64]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

# Load the movie data; "," inside numeric fields is parsed as a thousands separator.
df = pd.read_csv(
    'movies_with_sentiment.csv',
    thousands=",",
)

Convert to DF and Convert Strings to Numerical Values

In [65]:
# Label-encode each text column into an integer column named "<column>1".
# pd.Categorical derives the categories from the unique values (sorted when
# possible) and .codes gives each row's category position, with -1 marking
# missing values.
categorical_columns = ['released_month', 'genre', 'rated',
                       'director', 'writer', 'main_actor']
for column in categorical_columns:
    df[column + '1'] = pd.Categorical(df[column]).codes

Replace Null Values with the Median

In [66]:
# Fill missing review statistics with the column median. The filled copy is
# written to "<column>1" so the raw column is left untouched.
for source_col in ('metascore', 'rating', 'votes'):
    observed = df[source_col]
    median = observed.median()  # NaN entries are skipped when computing the median
    df[source_col + '1'] = observed.fillna(median)

# Budget is copied over unchanged; no imputation is applied to it.
df['budget1'] = df['budget']

Calculate Profit Ratio to Label Success of Film

In [67]:
# Profit ratio = gross divided by budget. Rows where either value is missing
# produce NaN.
df['profit_ratio'] = df['gross'] / df['budget']
In [68]:
# A movie is labelled successful when its profit ratio (gross / budget) is at least 3.
def isSuccess(x):
    """Return 1 if the profit ratio ``x`` is at least 3, otherwise 0.

    NaN never satisfies the comparison, so a missing ratio maps to 0.

    NOTE(review): a ratio of 3 means gross >= 3x budget, i.e. net profit
    (gross - budget) >= 2x budget. Confirm this is the intended cutoff.
    """
    return 1 if x >= 3 else 0

df['successful'] = df['profit_ratio'].apply(isSuccess)

# Select the modelling columns by name rather than by position, so that the
# selection does not silently shift if the CSV gains or loses a column.
# The order is significant: later cells slice the resulting array by position
# (features first, then profit_ratio, then the label).
model_columns = [
    'sentiment', 'released_month1', 'genre1', 'rated1', 'director1',
    'writer1', 'main_actor1', 'metascore1', 'rating1', 'votes1',
    'budget1', 'profit_ratio', 'successful',
]
df2 = df[model_columns]
df2.head()
Out[68]:
sentiment released_month1 genre1 rated1 director1 writer1 main_actor1 metascore1 rating1 votes1 budget1 profit_ratio successful
0 11 3 7 5 262 176 63 46 41 209678 40000000 4.154181 1
1 33 9 1 4 53 160 81 74 86 845024 165000000 1.139515 0
2 8 4 0 5 193 222 54 51 54 35637 70000000 0.101388 0
3 0 9 3 5 149 155 89 67 66 7698 5000000 0.625788 0
4 0 1 5 5 145 243 59 51 61 6968 1000000 0.023400 0

Sorting out data for 2014-2016 from Current Films

In [69]:
# Work from a single array view of the modelling frame.
movie_matrix = df2.values

# The first 327 rows are the data used for training and testing.
data = movie_matrix[:327]

# 7 movies that opened March 17, 2016 with no gross, rating, metascore, and votes;
# only the first 11 columns are kept for them.
opening_movies = movie_matrix[327:, :11]

Create Our Training and Test Sets

In [70]:
# sklearn.cross_validation was removed from scikit-learn; train_test_split now
# lives in sklearn.model_selection. Fall back to the old module only for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Columns 0-10 are the features and column 12 is the `successful` label.
# Column 11 (profit_ratio) is left out because the label is derived from it.
X_train, X_test, y_train, y_test = train_test_split(
    data[:, 0:11], data[:, 12], test_size=0.33, random_state=2)

Apply Different Classification Analysis Algorithms to the Training and Test Sets

Naive Bayes

In [71]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split.
clf_NB = GaussianNB()
nb_dec = clf_NB.fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Column 1 of predict_proba is the probability of the second class in
# clf_NB.classes_, which is the positive label for this 0/1 target.
nb_score = clf_NB.predict_proba(X_test)[:, 1]
output_NB = clf_NB.predict(X_test)

# Test-set accuracy is kept in accuracy_NB; a bare expression in the middle of
# a cell is not rendered, so it is not echoed here.
accuracy_NB = accuracy_score(y_test, output_NB)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(classification_report(y_test, output_NB))
             precision    recall  f1-score   support

        0.0       0.80      1.00      0.89        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.80      0.71       108

SVM

In [72]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): the features are on very different scales (compare budget1 and
# rating1 in the frame preview) and the report below shows no positive
# predictions; standardising the features first may help - confirm.
clf_svm = SVC()
svm_dec = clf_svm.fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Signed decision values for the test rows.
svm_score = svm_dec.decision_function(X_test)
output_svm = clf_svm.predict(X_test)

# Test-set accuracy is kept in accuracy_svm; a bare expression in the middle of
# a cell is not rendered, so it is not echoed here.
accuracy_svm = accuracy_score(y_test, output_svm)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(classification_report(y_test, output_svm))
             precision    recall  f1-score   support

        0.0       0.80      1.00      0.89        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.80      0.71       108

Logistic Regression

In [73]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings.
clf_lr = LogisticRegression()

# Keep the fitted estimator and its scores under separate names, matching the
# nb_dec / svm_dec / rf_dec pattern used by the other model cells.
lr_dec = clf_lr.fit(X_train, y_train)  # fit() returns the fitted estimator itself
lr_score = lr_dec.decision_function(X_test)
output_lr = clf_lr.predict(X_test)

# Test-set accuracy is kept in accuracy_lr; a bare expression in the middle of
# a cell is not rendered, so it is not echoed here.
accuracy_lr = accuracy_score(y_test, output_lr)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(classification_report(y_test, output_lr))
             precision    recall  f1-score   support

        0.0       0.89      0.97      0.93        86
        1.0       0.80      0.55      0.65        22

avg / total       0.87      0.88      0.87       108

Random Forest

In [74]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, each split considering 5 of the features. random_state is fixed
# (same seed as the train/test split) so that re-running the notebook
# reproduces the same forest and the same report.
clf_forest = RandomForestClassifier(n_estimators=100, max_features=5,
                                    random_state=2)
rf_dec = clf_forest.fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Column 1 of predict_proba is the probability of the second class in
# clf_forest.classes_, which is the positive label for this 0/1 target.
rf_score = clf_forest.predict_proba(X_test)[:, 1]
output_forest = clf_forest.predict(X_test)

# Test-set accuracy is kept in accuracy_forest; a bare expression in the middle
# of a cell is not rendered, so it is not echoed here.
accuracy_forest = accuracy_score(y_test, output_forest)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(classification_report(y_test, output_forest))
             precision    recall  f1-score   support

        0.0       0.79      0.99      0.88        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.79      0.70       108

Opening Movie Gross Profit Predictions

In [75]:
# Predict the success label for each opening movie with the fitted logistic
# regression model. opening_movies holds only the 11 feature columns, the same
# columns the model was trained on.
clf_lr.predict(opening_movies)
Out[75]:
array([ 0.,  0.,  0.,  1.,  1.,  1.,  0.])
Movie Budget Sentiment Success
The Divergent Series: Allegiant 115M 40 0
Miracles From Heaven 13M 41 0
Midnight Special 18M 29 0
The Bronze 3.5M -9 1
The Program 1.7M 9 1
My Golden Days 1.7M 26 1
Batman v Superman: Dawn of Justice 250M 23 0