Data Analysis
Predicting the Success of a Film¶
In [64]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# thousands=',' lets pandas parse comma-grouped figures (e.g. "1,234,567") as numbers.
df = pd.read_csv('movies_with_sentiment.csv', thousands=',')
Convert Categorical String Columns to Numerical Codes¶
In [65]:
# Encode each categorical text column as integer category codes, stored in a
# new "<name>1" column next to the original (pandas assigns -1 to missing
# values).
#
# The original cell repeated the same three statements for every column and
# encoded 'main_actor' twice; one loop creates the same columns. The order of
# this list is significant: later cells pick columns by position, so the new
# columns must be appended in exactly this order.
categorical_columns = ['released_month', 'genre', 'rated',
                       'director', 'writer', 'main_actor']
for column in categorical_columns:
    df[column + '1'] = pd.Categorical(df[column]).codes
Replace Null Values with the Median¶
In [66]:
# For each score-like column, create a "<name>1" copy in which missing values
# are replaced by that column's median (Series.median ignores NaN by default).
# NOTE(review): the median is taken over every row, including rows that later
# end up in the test split -- confirm that is acceptable for this analysis.
for col in ['metascore', 'rating', 'votes']:
    median = df[col].median()
    df[col + '1'] = df[col].fillna(median)

# Budget is carried over unchanged; no imputation is applied to it.
df['budget1'] = df['budget']
Calculate Profit Ratio to Label Success of Film¶
In [67]:
# Profit ratio = gross revenue divided by budget (a value of 3.0 means the film
# grossed three times what it cost). Rows with a missing gross or budget get
# NaN here.
# The original cell followed this with a self-assignment of the same column,
# which had no effect and has been removed.
df['profit_ratio'] = df['gross'] / df['budget']
In [68]:
# A film is labelled successful when its profit ratio (gross / budget) is at
# least three, i.e. it grossed at least three times its budget.
def isSuccess(x, threshold=3):
    """Return 1 if the profit ratio ``x`` reaches ``threshold``, otherwise 0.

    Parameters
    ----------
    x : number
        Profit ratio of a single film.
    threshold : number, optional
        Minimum ratio that counts as a success. Defaults to 3, the value
        that used to be hard-coded.

    Returns
    -------
    int
        1 for a success, 0 otherwise. A NaN ratio compares false against
        the threshold and is therefore labelled 0.
    """
    return 1 if x >= threshold else 0
# Turn every profit ratio into a 0/1 success label.
success_flags = df['profit_ratio'].apply(isSuccess)
df['successful'] = success_flags

# Keep only the columns from position 16 onward.
# NOTE(review): presumably these are the last raw column plus the engineered
# columns added above -- confirm against the CSV's column layout.
trailing_columns = df.columns[16:]
df2 = df[trailing_columns]
df2.head()
Out[68]:
Separating the 2014-2016 Data from Current Films¶
In [69]:
# Materialise the selected columns once as a NumPy array.
all_rows = df2.values

# The first 327 rows are the films used for training and evaluation.
data = all_rows[:327]

# Every row after that is one of the 7 movies that opened March 17, 2016 with
# no gross, rating, metascore, and votes; keep only their first 11 columns
# (the same feature columns the models are trained on).
opening_movies = all_rows[327:, :11]
Create Our Training and Test Sets¶
In [70]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Import from sklearn.model_selection and fall back to the old module
# only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are columns 0-10 of `data`; the label is column 12. Column 11 is
# deliberately left out of both.
# NOTE(review): presumably column 11 is profit_ratio (which would leak the
# label) and column 12 is `successful` -- confirm against df2's column order.
# A fixed random_state keeps the 67/33 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[:, 0:11], data[:, 12], test_size=0.33, random_state=2)
Apply Different Classification Analysis Algorithms to Test Sets¶
Naive Bayes¶
In [71]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a Gaussian Naive Bayes classifier on the training split.
clf_NB = GaussianNB()
nb_dec = clf_NB.fit(X_train, y_train)

# Probability assigned to the second class for each test row
# (presumably the "successful" label -- confirm via clf_NB.classes_).
nb_score = clf_NB.predict_proba(X_test)[:, 1]

# Hard predictions and their accuracy on the held-out split.
output_NB = clf_NB.predict(X_test)
accuracy_NB = accuracy_score(y_test, output_NB)

# A bare `accuracy_NB` in the middle of a cell is never displayed, so print it.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(accuracy_NB)
print(classification_report(y_test, output_NB))
SVM¶
In [72]:
from sklearn.svm import SVC

# Fit a support-vector classifier with scikit-learn's default settings.
clf_svm = SVC()
svm_dec = clf_svm.fit(X_train, y_train)

# Signed decision-function value for each test row.
svm_score = svm_dec.decision_function(X_test)

# Hard predictions and their accuracy on the held-out split.
output_svm = clf_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, output_svm)

# A bare `accuracy_svm` in the middle of a cell is never displayed, so print it.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(accuracy_svm)
print(classification_report(y_test, output_svm))
Logistic Regression¶
In [73]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings.
clf_lr = LogisticRegression()
# Keep the fitted estimator under its own name (matching nb_dec / svm_dec /
# rf_dec in the sibling cells) instead of temporarily storing it in lr_score.
lr_dec = clf_lr.fit(X_train, y_train)

# Signed decision-function value for each test row.
lr_score = lr_dec.decision_function(X_test)

# Hard predictions and their accuracy on the held-out split.
output_lr = clf_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, output_lr)

# A bare `accuracy_lr` in the middle of a cell is never displayed, so print it.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(accuracy_lr)
print(classification_report(y_test, output_lr))
Random Forest¶
In [74]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, each split considering 5 candidate features. random_state is
# fixed (same seed as the train/test split) so the forest -- and every number
# reported below -- is reproducible across runs.
clf_forest = RandomForestClassifier(n_estimators=100, max_features=5,
                                    random_state=2)
rf_dec = clf_forest.fit(X_train, y_train)

# Probability assigned to the second class for each test row
# (presumably the "successful" label -- confirm via clf_forest.classes_).
rf_score = clf_forest.predict_proba(X_test)[:, 1]

# Hard predictions and their accuracy on the held-out split.
output_forest = clf_forest.predict(X_test)
accuracy_forest = accuracy_score(y_test, output_forest)

# A bare `accuracy_forest` in the middle of a cell is never displayed, so print it.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(accuracy_forest)
print(classification_report(y_test, output_forest))
Opening Movie Gross Profit Predictions¶
In [75]:
# Use the fitted logistic-regression model to label each opening film
# (1 = predicted success, 0 = not); the array is shown as the cell's output.
opening_predictions = clf_lr.predict(opening_movies)
opening_predictions
Out[75]:
Movie | Budget | Sentiment | Success |
---|---|---|---|
The Divergent Series: Allegiant | 115M | 40 | 0 |
Miracles From Heaven | 13M | 41 | 0 |
Midnight Special | 18M | 29 | 0 |
The Bronze | 3.5M | -9 | 1 |
The Program | 1.7M | 9 | 1 |
My Golden Days | 1.7M | 26 | 1 |
Batman v Superman: Dawn of Justice | 250M | 23 | 0 |