Predicting the Success of a Film¶

import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option('display.max_columns', None)
# Load the movie dataset; thousands="," lets pandas parse comma-grouped
# figures such as "1,000,000" as numbers rather than strings.
df = pd.read_csv('movies_with_sentiment.csv', thousands = ",")

Convert to DF and Convert Strings to Numerical Values¶

# Encode each string-valued column as integer category codes, stored in a
# new column named "<column>1".  pd.Categorical numbers the distinct values
# of the column and gives missing values the code -1.
# The new columns are appended in the order listed here; later cells select
# columns by position, so keep this order stable.
categorical_columns = [
    'released_month',
    'genre',
    'rated',
    'director',
    'writer',
    'main_actor',
]
for column in categorical_columns:
    df[column + '1'] = pd.Categorical(df[column]).codes

Replace Null Values with the Median¶

# Impute missing review metrics: every "<column>1" is the source column with
# its nulls replaced by the median of that column's non-null values.
for source in ('metascore', 'rating', 'votes'):
    median = df[source].dropna().median()
    df[source + '1'] = df[source].fillna(median)

# Budget is carried over unchanged (no imputation is applied to it).
df['budget1'] = df['budget']

Calculate Profit Ratio to Label Success of Film¶

# Profit ratio = gross revenue divided by budget.  Rows where gross or budget
# is missing yield NaN; any ">=" comparison against NaN is False, so such
# rows can never be labelled successful.
df['profit_ratio'] = df['gross'] / df['budget']

def isSuccess(x, threshold=3):
    """Label a film as successful (1) or not (0) from its profit ratio.

    A film is successful when its profit ratio is at least ``threshold``.
    The ratio passed in is gross divided by budget, so the default of 3
    means the film grossed at least three times its budget.

    Args:
        x: Profit ratio (gross / budget) of one film.  NaN is accepted and
            is labelled 0, because NaN never compares greater-or-equal.
        threshold: Minimum ratio that counts as a success.  Defaults to 3.

    Returns:
        1 if ``x >= threshold``, otherwise 0.
    """
    return 1 if x >= threshold else 0

# Label every film by applying isSuccess to its profit ratio.
df['successful'] = df['profit_ratio'].apply(isSuccess)
# Keep only the columns from position 16 onward.
# NOTE(review): presumably these are the derived columns (sentiment, the
# "...1" encodings/imputations, profit_ratio, successful) — confirm against
# the CSV's column layout, since this slice is purely positional.
df2 = df[df.columns[16:]]
# Preview the first rows (notebook display).
df2.head()

Sorting out data for 2014-2016 from Current Films¶

# The first 327 rows of df2 are the dataset used for training and testing
# (all columns kept, including the label).
labelled_row_count = 327
df2_matrix = df2.values
data = df2_matrix[:labelled_row_count]

# The remaining rows are the 7 movies that opened March 17, 2016 with no
# gross, rating, metascore, and votes; only the first 11 columns are kept
# for them.
opening_movies = df2_matrix[labelled_row_count:, :11]

Create Our Test Sets¶

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module was deprecated and then removed.
# Try the current location first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are the first 11 columns of `data`; the label is column 12.
# 33% of the rows are held out for testing, with a fixed seed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data[:, 0:11], data[:, 12], test_size=0.33, random_state=2)

Apply Different Classification Analysis Algorithms to Test Sets¶

Naive Bayes¶

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a Gaussian Naive Bayes classifier on the training split.
clf_NB = GaussianNB()
nb_dec = clf_NB.fit(X_train, y_train)
# Score = predicted probability in column 1 of predict_proba, i.e. the
# second class in clf_NB.classes_.
nb_score = clf_NB.predict_proba(X_test)[:, 1]
output_NB = clf_NB.predict(X_test)
accuracy_NB = accuracy_score(y_test, output_NB)
accuracy_NB

# print(...) with a single argument behaves the same on Python 2 and 3.
print(classification_report(y_test, output_NB))

             precision    recall  f1-score   support

        0.0       0.80      1.00      0.89        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.80      0.71       108

SVM¶

from sklearn.svm import SVC

# Fit a support-vector classifier with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled and the recorded report
# shows no positive predictions — scaling the features may be worth trying.
clf_svm = SVC()
svm_dec = clf_svm.fit(X_train, y_train)
# Signed distance to the decision boundary, kept as a score.
svm_score = svm_dec.decision_function(X_test)
output_svm = clf_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, output_svm)
accuracy_svm

# print(...) with a single argument behaves the same on Python 2 and 3.
print(classification_report(y_test, output_svm))

             precision    recall  f1-score   support

        0.0       0.80      1.00      0.89        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.80      0.71       108

Logistic Regression¶

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings.
clf_lr = LogisticRegression()
# fit() returns the estimator itself; keep it under its own name instead of
# temporarily storing it in lr_score.
lr_dec = clf_lr.fit(X_train, y_train)
# Decision-function value for each test row, kept as a score.
lr_score = lr_dec.decision_function(X_test)
output_lr = clf_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, output_lr)
accuracy_lr

# print(...) with a single argument behaves the same on Python 2 and 3.
print(classification_report(y_test, output_lr))

             precision    recall  f1-score   support

        0.0       0.89      0.97      0.93        86
        1.0       0.80      0.55      0.65        22

avg / total       0.87      0.88      0.87       108

Random Forest¶

from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest that considers 5 features at each split.
# NOTE(review): no random_state is set, so results can differ between runs.
clf_forest = RandomForestClassifier(n_estimators=100, max_features=5)
rf_dec = clf_forest.fit(X_train, y_train)
# Score = predicted probability in column 1 of predict_proba, i.e. the
# second class in clf_forest.classes_.
rf_score = clf_forest.predict_proba(X_test)[:, 1]
output_forest = clf_forest.predict(X_test)
accuracy_forest = accuracy_score(y_test, output_forest)
accuracy_forest

# print(...) with a single argument behaves the same on Python 2 and 3.
print(classification_report(y_test, output_forest))

             precision    recall  f1-score   support

        0.0       0.79      0.99      0.88        86
        1.0       0.00      0.00      0.00        22

avg / total       0.63      0.79      0.70       108

Opening Movie Gross Profit Predictions¶

# Predict the success label for each opening movie using the fitted
# logistic-regression model (notebook display of the resulting array).
clf_lr.predict(opening_movies)

array([ 0.,  0.,  0.,  1.,  1.,  1.,  0.])

Movie	Budget	Sentiment	Success
The Divergent Series: Allegiant	115M	40	0
Miracles From Heaven	13M	41	0
Midnight Special	18M	29	0
The Bronze	3.5M	-9	1
The Program	1.7M	9	1
My Golden Days	1.7M	26	1
Batman v Superman: Dawn of Justice	250M	23	0

	sentiment	released_month1	genre1	rated1	director1	writer1	main_actor1	metascore1	rating1	votes1	budget1	profit_ratio	successful
0	11	3	7	5	262	176	63	46	41	209678	40000000	4.154181	1
1	33	9	1	4	53	160	81	74	86	845024	165000000	1.139515	0
2	8	4	0	5	193	222	54	51	54	35637	70000000	0.101388	0
3	0	9	3	5	149	155	89	67	66	7698	5000000	0.625788	0
4	0	1	5	5	145	243	59	51	61	6968	1000000	0.023400	0