import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy.stats import pearsonr
import random
warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
data = pd.read_csv('../dataset/processed/cleaned_data.csv')
data.head()
Movie_Title | Movie_ID | Budget | Cast | Crew | Studios | Genre | Keywords | Languages | Countries | Release_Data | Runtime | Gross_worldwide | Rating | Rating_Count | ListOfCertificate | Release_Year | Release_Month | Release_Day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Star Wars: Episode VII - The Force Awakens | 2488496 | 245000000 | ['Daisy Ridley', 'John Boyega', 'Oscar Isaac',... | ['Lawrence Kasdan', 'Michael Arndt', 'J.J. Abr... | ['Lucasfilm', 'Bad Robot', 'Truenorth Producti... | ['Action', 'Adventure', 'Sci-Fi'] | ['reboot', 'sanitation employee', 'remake', 'c... | ['English'] | ['United States'] | 2015-12-18 | 138.0 | 2069521700 | 7.8 | 893000 | ['PG-13'] | 2015 | 12 | 18 |
1 | Frozen II | 4520988 | 150000000 | ['Kristen Bell', 'Idina Menzel', 'Josh Gad', '... | ['Jennifer Lee', 'Hans Christian Andersen', 'C... | ['Walt Disney Animation Studios', 'Walt Disney... | ['Animation', 'Adventure', 'Comedy', 'Family',... | ['autumn', 'anthropomorphic snowman', 'princes... | ['English'] | ['United States'] | 2019-11-22 | 103.0 | 1450026933 | 6.8 | 156000 | ['PG'] | 2019 | 11 | 22 |
2 | The Dark Knight Rises | 1345836 | 250000000 | ['Christian Bale', 'Tom Hardy', 'Anne Hathaway... | ['Jonathan Nolan', 'Christopher Nolan', 'David... | ['Warner Bros.', 'Legendary Entertainment', 'D... | ['Action', 'Crime', 'Drama'] | ['dc comics', 'batman character', 'bruce wayne... | ['English', 'Arabic'] | ['United Kingdom', 'United States'] | 2012-07-27 | 164.0 | 1081142612 | 8.4 | 1600000 | ['PG-13'] | 2012 | 7 | 27 |
3 | Beauty and the Beast | 2771200 | 160000000 | ['Emma Watson', 'Dan Stevens', 'Luke Evans', '... | ['Evan Spiliotopoulos', 'Bill Condon', 'Stephe... | ['Mandeville Films', 'Walt Disney Pictures'] | ['Adventure', 'Family', 'Fantasy', 'Musical', ... | ['beast', 'fairy tale', 'heroine', "beast's he... | ['English'] | ['United States'] | 2017-03-17 | 129.0 | 1273576220 | 7.1 | 293000 | ['PG'] | 2017 | 3 | 17 |
4 | Finding Dory | 2277860 | 200000000 | ['Ellen DeGeneres', 'Albert Brooks', "Ed O'Nei... | ['Angus MacLane', 'Victoria Strouse', 'Andrew ... | ['Pixar Animation Studios', 'Walt Disney Pictu... | ['Animation', 'Adventure', 'Comedy', 'Family'] | ['fish', 'ocean', 'whale', 'octopus driving a ... | ['English', 'Indonesian'] | ['United States'] | 2016-06-17 | 97.0 | 1028570942 | 7.3 | 259000 | ['PG'] | 2016 | 6 | 17 |
Load multivalued attributes
import ast
cols = ['Cast', 'Genre', 'Studios', 'ListOfCertificate', 'Keywords', 'Languages', 'Countries', 'Crew']
for col in cols:
data[col] = data[col].apply(ast.literal_eval)
def parseWithMoneyAndCount(dataframe, col_name):
    """Aggregate worldwide gross per distinct value of a multivalued column.

    Every row of ``dataframe[col_name]`` holds a list of values; each value
    in the list is credited with the full ``Gross_worldwide`` of its row.

    Parameters
    ----------
    dataframe : Dataframe to process. Must contain ``col_name`` and
        ``Gross_worldwide``.
    col_name : Name of the column to process, ``Cast``, ``Genre`` and
        ``Studios`` for example.

    Returns
    -------
    DataFrame with one row per distinct value (sorted by value) and the
    columns ``col_name``, ``Total``, ``Count``, ``Mean`` and ``Median``.
    """
    values = []
    gross = []
    # Walk both columns in lockstep instead of looking the gross up by the
    # enumerate() position: a label lookup with a positional counter only
    # works when the frame still has its default 0..n-1 index.
    for record, amount in zip(dataframe[col_name], dataframe['Gross_worldwide']):
        for value in record:
            values.append(value)
            gross.append(amount)

    # One long-format frame, one groupby, all four statistics at once.
    exploded = pd.DataFrame({col_name: values, 'Gross_worldwide': gross})
    grouped = exploded.groupby(col_name)['Gross_worldwide']
    result = grouped.agg(Total='sum', Count='size', Mean='mean', Median='median')
    return result.reset_index()
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8752 entries, 0 to 8751 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Movie_Title 8752 non-null object 1 Movie_ID 8752 non-null int64 2 Budget 8752 non-null int64 3 Cast 8752 non-null object 4 Crew 8752 non-null object 5 Studios 8752 non-null object 6 Genre 8752 non-null object 7 Keywords 8752 non-null object 8 Languages 8752 non-null object 9 Countries 8752 non-null object 10 Release_Data 8752 non-null object 11 Runtime 8752 non-null float64 12 Gross_worldwide 8752 non-null int64 13 Rating 8752 non-null float64 14 Rating_Count 8752 non-null int64 15 ListOfCertificate 8752 non-null object 16 Release_Year 8752 non-null int64 17 Release_Month 8752 non-null int64 18 Release_Day 8752 non-null int64 dtypes: float64(2), int64(7), object(10) memory usage: 1.3+ MB
data.describe()
Movie_ID | Budget | Runtime | Gross_worldwide | Rating | Rating_Count | Release_Year | Release_Month | Release_Day | |
---|---|---|---|---|---|---|---|---|---|
count | 8.752000e+03 | 8.752000e+03 | 8752.000000 | 8.752000e+03 | 8752.000000 | 8.752000e+03 | 8752.000000 | 8752.000000 | 8752.000000 |
mean | 1.044857e+06 | 2.351619e+07 | 108.012117 | 7.172680e+07 | 6.430884 | 8.347387e+04 | 2000.010398 | 6.688643 | 16.042162 |
std | 1.743793e+06 | 3.713275e+07 | 19.559340 | 1.564077e+08 | 1.005946 | 1.641414e+05 | 13.471522 | 3.416011 | 8.602773 |
min | 1.234900e+04 | 2.200000e+02 | 45.000000 | 9.500000e+01 | 1.400000 | 0.000000e+00 | 1921.000000 | 1.000000 | 1.000000 |
25% | 1.023130e+05 | 2.200000e+02 | 95.000000 | 4.443069e+06 | 5.800000 | 7.100000e+03 | 1991.000000 | 4.000000 | 9.000000 |
50% | 2.504325e+05 | 1.000000e+07 | 105.000000 | 1.821152e+07 | 6.500000 | 2.600000e+04 | 2002.000000 | 7.000000 | 16.000000 |
75% | 1.221610e+06 | 3.000000e+07 | 117.000000 | 6.560984e+07 | 7.200000 | 8.600000e+04 | 2011.000000 | 10.000000 | 23.000000 |
max | 1.103237e+07 | 3.560000e+08 | 357.000000 | 2.847246e+09 | 9.300000 | 2.500000e+06 | 2021.000000 | 12.000000 | 31.000000 |
cols = ['Budget', 'Runtime', 'Release_Year', 'Gross_worldwide', 'Rating', 'Rating_Count', 'Release_Month']
info = data[cols]
sns.heatmap(info.corr(), annot=True)
<AxesSubplot:>
data.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'Movie_ID'}>, <AxesSubplot:title={'center':'Budget'}>, <AxesSubplot:title={'center':'Runtime'}>], [<AxesSubplot:title={'center':'Gross_worldwide'}>, <AxesSubplot:title={'center':'Rating'}>, <AxesSubplot:title={'center':'Rating_Count'}>], [<AxesSubplot:title={'center':'Release_Year'}>, <AxesSubplot:title={'center':'Release_Month'}>, <AxesSubplot:title={'center':'Release_Day'}>]], dtype=object)
from pandas.plotting import scatter_matrix
scatter_matrix(info, figsize=(20, 12), hist_kwds={'bins': 50})
array([[<AxesSubplot:xlabel='Budget', ylabel='Budget'>, <AxesSubplot:xlabel='Runtime', ylabel='Budget'>, <AxesSubplot:xlabel='Release_Year', ylabel='Budget'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Budget'>, <AxesSubplot:xlabel='Rating', ylabel='Budget'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Budget'>, <AxesSubplot:xlabel='Release_Month', ylabel='Budget'>], [<AxesSubplot:xlabel='Budget', ylabel='Runtime'>, <AxesSubplot:xlabel='Runtime', ylabel='Runtime'>, <AxesSubplot:xlabel='Release_Year', ylabel='Runtime'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Runtime'>, <AxesSubplot:xlabel='Rating', ylabel='Runtime'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Runtime'>, <AxesSubplot:xlabel='Release_Month', ylabel='Runtime'>], [<AxesSubplot:xlabel='Budget', ylabel='Release_Year'>, <AxesSubplot:xlabel='Runtime', ylabel='Release_Year'>, <AxesSubplot:xlabel='Release_Year', ylabel='Release_Year'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Release_Year'>, <AxesSubplot:xlabel='Rating', ylabel='Release_Year'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Release_Year'>, <AxesSubplot:xlabel='Release_Month', ylabel='Release_Year'>], [<AxesSubplot:xlabel='Budget', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Runtime', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Release_Year', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Rating', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Gross_worldwide'>, <AxesSubplot:xlabel='Release_Month', ylabel='Gross_worldwide'>], [<AxesSubplot:xlabel='Budget', ylabel='Rating'>, <AxesSubplot:xlabel='Runtime', ylabel='Rating'>, <AxesSubplot:xlabel='Release_Year', ylabel='Rating'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Rating'>, <AxesSubplot:xlabel='Rating', ylabel='Rating'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Rating'>, <AxesSubplot:xlabel='Release_Month', ylabel='Rating'>], [<AxesSubplot:xlabel='Budget', 
ylabel='Rating_Count'>, <AxesSubplot:xlabel='Runtime', ylabel='Rating_Count'>, <AxesSubplot:xlabel='Release_Year', ylabel='Rating_Count'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Rating_Count'>, <AxesSubplot:xlabel='Rating', ylabel='Rating_Count'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Rating_Count'>, <AxesSubplot:xlabel='Release_Month', ylabel='Rating_Count'>], [<AxesSubplot:xlabel='Budget', ylabel='Release_Month'>, <AxesSubplot:xlabel='Runtime', ylabel='Release_Month'>, <AxesSubplot:xlabel='Release_Year', ylabel='Release_Month'>, <AxesSubplot:xlabel='Gross_worldwide', ylabel='Release_Month'>, <AxesSubplot:xlabel='Rating', ylabel='Release_Month'>, <AxesSubplot:xlabel='Rating_Count', ylabel='Release_Month'>, <AxesSubplot:xlabel='Release_Month', ylabel='Release_Month'>]], dtype=object)
final = data.copy()
Genre analysis
genre = parseWithMoneyAndCount(data, 'Genre')
genre
Genre | Total | Count | Mean | Median | |
---|---|---|---|---|---|
0 | Action | 270712872044 | 1892 | 1.430829e+08 | 50747624.5 |
1 | Adventure | 298281440727 | 1515 | 1.968854e+08 | 68514844.0 |
2 | Animation | 84311140430 | 400 | 2.107779e+08 | 104469116.5 |
3 | Biography | 26964736252 | 663 | 4.067079e+07 | 13448497.0 |
4 | Comedy | 222959609180 | 3232 | 6.898503e+07 | 20980459.5 |
5 | Crime | 87149222552 | 1589 | 5.484533e+07 | 19870567.0 |
6 | Documentary | 2710526736 | 243 | 1.115443e+07 | 2702578.0 |
7 | Drama | 224422468047 | 4454 | 5.038672e+07 | 14657538.0 |
8 | Family | 137386773092 | 936 | 1.467807e+08 | 51444620.5 |
9 | Fantasy | 153116140508 | 981 | 1.560817e+08 | 50693129.0 |
10 | Film-Noir | 4890859 | 11 | 4.446235e+05 | 22356.0 |
11 | History | 18744144753 | 391 | 4.793899e+07 | 13130349.0 |
12 | Horror | 39714174296 | 707 | 5.617281e+07 | 25051865.0 |
13 | Music | 19600102040 | 449 | 4.365279e+07 | 11749595.0 |
14 | Musical | 30446413474 | 279 | 1.091269e+08 | 22762571.0 |
15 | Mystery | 62314981386 | 880 | 7.081248e+07 | 23673201.0 |
16 | News | 57917412 | 3 | 1.930580e+07 | 4606199.0 |
17 | Romance | 109277844855 | 2201 | 4.964918e+07 | 15164458.0 |
18 | Sci-Fi | 152723454914 | 866 | 1.763550e+08 | 51939597.0 |
19 | Sport | 16388866644 | 374 | 4.382050e+07 | 17712898.0 |
20 | Thriller | 169670494650 | 2224 | 7.629069e+07 | 26492560.5 |
21 | War | 20548159884 | 357 | 5.755787e+07 | 15291277.0 |
22 | Western | 7393407155 | 143 | 5.170215e+07 | 15164458.0 |
genre.sort_values(by='Count', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Count')
genre.sort_values(by='Mean', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Mean', color="salmon")
genre.sort_values(by='Median', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Median', color="salmon")
<AxesSubplot:xlabel='Genre'>
From the bar charts we can see that the following genres generate the highest gross:
Film-Noir stays at the lowest position
News, though with only 3 records, is in the top 6 of the median plot
genre.sort_values(by='Count', ascending=False)
Genre | Total | Count | Mean | Median | |
---|---|---|---|---|---|
7 | Drama | 224422468047 | 4454 | 5.038672e+07 | 14657538.0 |
4 | Comedy | 222959609180 | 3232 | 6.898503e+07 | 20980459.5 |
20 | Thriller | 169670494650 | 2224 | 7.629069e+07 | 26492560.5 |
17 | Romance | 109277844855 | 2201 | 4.964918e+07 | 15164458.0 |
0 | Action | 270712872044 | 1892 | 1.430829e+08 | 50747624.5 |
5 | Crime | 87149222552 | 1589 | 5.484533e+07 | 19870567.0 |
1 | Adventure | 298281440727 | 1515 | 1.968854e+08 | 68514844.0 |
9 | Fantasy | 153116140508 | 981 | 1.560817e+08 | 50693129.0 |
8 | Family | 137386773092 | 936 | 1.467807e+08 | 51444620.5 |
15 | Mystery | 62314981386 | 880 | 7.081248e+07 | 23673201.0 |
18 | Sci-Fi | 152723454914 | 866 | 1.763550e+08 | 51939597.0 |
12 | Horror | 39714174296 | 707 | 5.617281e+07 | 25051865.0 |
3 | Biography | 26964736252 | 663 | 4.067079e+07 | 13448497.0 |
13 | Music | 19600102040 | 449 | 4.365279e+07 | 11749595.0 |
2 | Animation | 84311140430 | 400 | 2.107779e+08 | 104469116.5 |
11 | History | 18744144753 | 391 | 4.793899e+07 | 13130349.0 |
19 | Sport | 16388866644 | 374 | 4.382050e+07 | 17712898.0 |
21 | War | 20548159884 | 357 | 5.755787e+07 | 15291277.0 |
14 | Musical | 30446413474 | 279 | 1.091269e+08 | 22762571.0 |
6 | Documentary | 2710526736 | 243 | 1.115443e+07 | 2702578.0 |
22 | Western | 7393407155 | 143 | 5.170215e+07 | 15164458.0 |
10 | Film-Noir | 4890859 | 11 | 4.446235e+05 | 22356.0 |
16 | News | 57917412 | 3 | 1.930580e+07 | 4606199.0 |
We will develop a Genre Rank based on Median, ignoring the genres that have 10 or fewer releases
# Keep only genres with more than 10 releases and order them from the lowest
# to the highest median gross. The chain returns a new frame at each step, so
# nothing is sorted in place on a filtered slice of the original table.
genre = (
    genre[genre['Count'] > 10]
    .sort_values(by='Median', ascending=True)
    .reset_index(drop=True)
)
# Rank 1 is the genre with the lowest median gross; the highest rank is the
# genre with the highest median gross.
genreRank = {name: rank for rank, name in enumerate(genre['Genre'], start=1)}
genreRank
{'Film-Noir': 1, 'Documentary': 2, 'Music': 3, 'History': 4, 'Biography': 5, 'Drama': 6, 'Romance': 7, 'Western': 8, 'War': 9, 'Sport': 10, 'Crime': 11, 'Comedy': 12, 'Musical': 13, 'Mystery': 14, 'Horror': 15, 'Thriller': 16, 'Fantasy': 17, 'Action': 18, 'Family': 19, 'Sci-Fi': 20, 'Adventure': 21, 'Animation': 22}
We rank each movie by the highest rank among the genres in its list of genres (since Genre is a multivalued attribute)
def getRank(listGenre, ranks=None):
    """Return the highest rank among the genres in ``listGenre``.

    Parameters
    ----------
    listGenre : iterable of genre names for one movie.
    ranks : optional mapping of genre name to rank. Defaults to the
        module-level ``genreRank`` dictionary.

    Returns
    -------
    The largest rank found, or 0 when none of the genres is ranked.
    """
    if ranks is None:
        ranks = genreRank
    # Genres missing from the mapping are skipped; default=0 covers the case
    # where nothing is left to compare.
    return max((ranks[g] for g in listGenre if g in ranks), default=0)
data2 = data.copy()
data2['GenreRank'] = data2['Genre'].apply(getRank)
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsonr correlation between GenreRank and Gross: %.3f' % corr)
data2.plot.scatter(x='GenreRank', y='Gross_worldwide', color='blue')
Pearsonr correlation between GenreRank and Gross: 0.253
<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>
We rank each movie by the total rank of the genres in its list of genres
def getRank(listGenre, ranks=None):
    """Return the sum of the ranks of the genres in ``listGenre``.

    Parameters
    ----------
    listGenre : iterable of genre names for one movie.
    ranks : optional mapping of genre name to rank. Defaults to the
        module-level ``genreRank`` dictionary.

    Returns
    -------
    The total rank of all ranked genres, or 0 when none is ranked.
    """
    if ranks is None:
        ranks = genreRank
    # Start the total at 0 (not at a -1 sentinel) so the result is the true
    # sum and a single rank-1 genre is not reported as "unranked".
    return sum(ranks[g] for g in listGenre if g in ranks)
data2 = data.copy()
data2['GenreRank'] = data2['Genre'].apply(getRank)
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between GenreRank and Gross: %.3f' % corr)
data2.plot.scatter(x='GenreRank', y='Gross_worldwide', color='royalblue')
Pearsons correlation between GenreRank and Gross: 0.316
<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>
We rank each movie by the average rank of the genres in its list of genres (since Genre is a multivalued attribute)
def getRank(listGenre, ranks=None):
    """Return the average rank of the ranked genres in ``listGenre``.

    Parameters
    ----------
    listGenre : iterable of genre names for one movie.
    ranks : optional mapping of genre name to rank. Defaults to the
        module-level ``genreRank`` dictionary.

    Returns
    -------
    The mean rank of the genres found in ``ranks``. When no genre is ranked,
    a random integer rank between 1 and 15 (inclusive) is returned instead.
    """
    if ranks is None:
        ranks = genreRank
    ranked = [ranks[g] for g in listGenre if g in ranks]
    if not ranked:
        # NOTE: unseeded random fallback - values for unranked movies differ
        # between runs unless random.seed() is called beforehand.
        return random.randint(1, 15)
    # Average over the ranked genres only, so an unranked genre in the list
    # does not pull the average down.
    return sum(ranked) / len(ranked)
data2 = data.copy()
data2['GenreRank'] = data2['Genre'].apply(getRank)
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between GenreRank and Gross: %.3f' % corr)
data2.plot.scatter(x='GenreRank', y='Gross_worldwide', color='royalblue')
Pearsons correlation between GenreRank and Gross: 0.325
<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>
final['GenreRank'] = data2['GenreRank']
Now we will test the hypothesis that movies with the Adventure genre have a higher gross than other movies
data2 = data.copy()
data2['IsAdventure'] = data2['Genre'].apply(lambda x: 1 if 'Adventure' in x else 0)
corr = pearsonr(data2['IsAdventure'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between IsAdventure and Gross: %.3f' % corr)
data2.plot.scatter(x='IsAdventure', y='Gross_worldwide', color='royalblue')
Pearsons correlation between IsAdventure and Gross: 0.366
<AxesSubplot:xlabel='IsAdventure', ylabel='Gross_worldwide'>
We will choose this one since it has high correlation of 0.366
final['IsAdventure'] = data2['IsAdventure']
data2 = data.copy()
data2['Release_Data'] = pd.to_datetime(data2['Release_Data'], format='%Y-%m-%d')
data2 = data2.assign(WeekDay=data2['Release_Data'].dt.weekday + 2)
data2['WeekDay'].value_counts()
6 6790 4 1004 5 624 7 131 3 89 2 60 8 54 Name: WeekDay, dtype: int64
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
data['Release_Day'].value_counts().rename_axis('index').reset_index(name='Release_Day').plot.bar(x='index', y='Release_Day')
<AxesSubplot:xlabel='index'>
# Sort by weekday code, then name both columns explicitly so the plot does not
# depend on the pandas-version-specific labels of value_counts().reset_index().
data2['WeekDay'].value_counts().sort_index().rename_axis('index').reset_index(name='WeekDay').plot.bar(x='index', y='WeekDay')
<AxesSubplot:xlabel='index'>
Movies tend to be released on Fridays
data2.plot.scatter(x='WeekDay', y='Gross_worldwide', color='brown')
<AxesSubplot:xlabel='WeekDay', ylabel='Gross_worldwide'>
No linear relation
corr = pearsonr(data2['WeekDay'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between WeekDay and Gross: %.3f' % corr)
data2.plot.scatter(x='WeekDay', y='Gross_worldwide', color='blue')
Pearsons correlation between WeekDay and Gross: 0.003
<AxesSubplot:xlabel='WeekDay', ylabel='Gross_worldwide'>
Hypothesis: Do movies released on Friday or Wednesday have a higher gross than others?
data2['IsFriWed'] = data2['WeekDay'].apply(lambda x: 1 if x in [4, 6] else 0)
corr = pearsonr(data2['IsFriWed'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between IsFriWed and Gross: %.3f' % corr)
data2.plot.scatter(x='IsFriWed', y='Gross_worldwide', color='blue')
Pearsons correlation between IsFriWed and Gross: 0.047
<AxesSubplot:xlabel='IsFriWed', ylabel='Gross_worldwide'>
There is not much of a relationship between them!
Number of release by month
# Number of releases per calendar month, ordered January..December.
# The columns are named directly instead of renaming the defaults, because the
# default labels of value_counts().reset_index() changed in pandas 2.0.
month = (
    data['Release_Month']
    .value_counts()
    .sort_index()
    .rename_axis('Month')
    .reset_index(name='ReleaseCount')
)
month.plot.bar(x='Month', y='ReleaseCount', color='green')
<AxesSubplot:xlabel='Month'>
How the average gross depends on Release_Month, combined with Release_Year.
cols = ['Release_Month', 'Gross_worldwide']
data.plot.scatter(x='Release_Month', y='Gross_worldwide', color='green')
plt.title("Scatter plot for Release_Month and Gross")
#3
# Month numbers drawn in green versus red when months are used as a plot hue.
gColor = [6, 7, 12]
rColor = [1, 2, 3, 4, 5, 8, 9, 10, 11]
# Month number -> matplotlib colour name (green entries first, then red).
colorMap = {m: 'tab:green' for m in gColor}
colorMap.update({m: 'tab:red' for m in rColor})
C = colorMap
fig = plt.figure(figsize=(8, 6))
sns.scatterplot(data=data, x='Release_Year', y='Gross_worldwide', hue='Release_Month', palette=C)
plt.title("How gross distributed by Month and Year")
plt.show()
We can see that gross tends to increase by year.
corr = pearsonr(data2['Release_Year'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between Release_Year and Gross: %.3f' % corr)
Pearsons correlation between Release_Year and Gross: 0.209
cols = ['Release_Month', 'Gross_worldwide']
month = data[cols]
month = month.groupby("Release_Month").median().reset_index()
#
month.plot.bar(x='Release_Month', y='Gross_worldwide')
plt.title("Median by Month")
Text(0.5, 1.0, 'Median by Month')
From the median plot of Month and Gross, we can see that 5, 6, 7 and 12 are the top months (we call them special months)
def getSpecialMonth(month, specialMonth=(6, 7, 12)):
    """Flag whether ``month`` is one of the special release months.

    Parameters
    ----------
    month : month number to test.
    specialMonth : collection of month numbers treated as special.
        Defaults to June, July and December.

    Returns
    -------
    1 when ``month`` is in ``specialMonth``, otherwise 0.
    """
    return 1 if month in specialMonth else 0
data2 = data.copy()
data2['SpecialMonth'] = data2['Release_Month'].apply(getSpecialMonth)
corr = pearsonr(data2['SpecialMonth'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between SpecialMonth and Gross: %.3f' % corr)
data2.plot.scatter(x='SpecialMonth', y='Gross_worldwide', color='blue')
Pearsons correlation between SpecialMonth and Gross: 0.120
<AxesSubplot:xlabel='SpecialMonth', ylabel='Gross_worldwide'>
This correlation is low. We will consider by Mean instead
month = data[cols]
month = month.groupby("Release_Month").mean().reset_index()
#1
month.plot.bar(x='Release_Month', y='Gross_worldwide')
plt.title("Average by Month")
Text(0.5, 1.0, 'Average by Month')
From the mean plot of Month and Gross, we can see that 5, 6, 7, 11 and 12 are the top months (we call them special months)
def getSpecialMonth(month, specialMonth=(5, 6, 7, 11, 12)):
    """Flag whether ``month`` is one of the special release months.

    Parameters
    ----------
    month : month number to test.
    specialMonth : collection of month numbers treated as special.
        Defaults to May, June, July, November and December.

    Returns
    -------
    1 when ``month`` is in ``specialMonth``, otherwise 0.
    """
    return 1 if month in specialMonth else 0
data2 = data.copy()
data2['SpecialMonth'] = data2['Release_Month'].apply(getSpecialMonth)
corr = pearsonr(data2['SpecialMonth'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between SpecialMonth and Gross: %.3f' % corr)
data2.plot.scatter(x='SpecialMonth', y='Gross_worldwide', color='blue')
Pearsons correlation between SpecialMonth and Gross: 0.142
<AxesSubplot:xlabel='SpecialMonth', ylabel='Gross_worldwide'>
This has higher correlation so we will choose it
final['SpecialMonth'] = data2['SpecialMonth']
data['Gross_worldwide'].describe()
count 8.752000e+03 mean 7.172680e+07 std 1.564077e+08 min 9.500000e+01 25% 4.443069e+06 50% 1.821152e+07 75% 6.560984e+07 max 2.847246e+09 Name: Gross_worldwide, dtype: float64
Min value is 95 $
data['Budget'].describe()
count 8.752000e+03 mean 2.351619e+07 std 3.713275e+07 min 2.200000e+02 25% 2.200000e+02 50% 1.000000e+07 75% 3.000000e+07 max 3.560000e+08 Name: Budget, dtype: float64
No 0$ value Budget (Min value is 220$)
Relation between Budget and Gross_worldwide
sns.lmplot(data=data, x='Budget', y='Gross_worldwide')
<seaborn.axisgrid.FacetGrid at 0x2843b774760>
Now we will test the correlation between Budget and Gross_WorldWide
corr = pearsonr(data['Budget'], data['Gross_worldwide'])[0]
print('Pearsons correlation between Budget and Gross: %.3f' % corr)
Pearsons correlation between Budget and Gross: 0.741
Budget and Gross_worldwide are strongly correlated, so Budget may perform very well in our future model
Hypothesis: Does Gross depend on the number of actors shown on the movie's IMDb webpage?
data2 = data.copy()
data2['numCast'] = data2['Cast'].apply(lambda x: len(x))
corr = pearsonr(data2['numCast'], data2['Gross_worldwide'])[0]
data2['numCast'].value_counts()
18 8203 16 66 17 64 15 62 14 53 13 48 12 45 10 35 11 35 9 26 8 23 7 17 6 17 5 12 1 11 4 11 3 11 2 8 0 5 Name: numCast, dtype: int64
corr = pearsonr(data2['numCast'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between numCast and Gross: %.3f' % corr)
data2.plot.scatter(x='numCast', y='Gross_worldwide', color='brown')
Pearsons correlation between numCast and Gross: 0.059
<AxesSubplot:xlabel='numCast', ylabel='Gross_worldwide'>
->> Very low correlation,
since almost all IMDb pages tend to list more than 17 cast members per page
Cast and average gross of the movies they cast for
cast = parseWithMoneyAndCount(data, 'Cast')
cast.sort_values(by='Count', ascending=False, inplace=True)
cast
Cast | Total | Count | Mean | Median | |
---|---|---|---|---|---|
61269 | Samuel L. Jackson | 18580682439 | 83 | 2.238636e+08 | 62022014.0 |
58569 | Robert De Niro | 6722811414 | 77 | 8.730924e+07 | 45491656.0 |
49987 | Morgan Freeman | 10397736669 | 65 | 1.599652e+08 | 95943453.0 |
33993 | John Goodman | 6019264965 | 62 | 9.708492e+07 | 37207906.0 |
41569 | Liam Neeson | 7206028513 | 61 | 1.181316e+08 | 48878502.0 |
... | ... | ... | ... | ... | ... |
28218 | István Znamenák | 1240663 | 1 | 1.240663e+06 | 1240663.0 |
28219 | Italia Coppola | 636796 | 1 | 6.367960e+05 | 636796.0 |
28221 | Italo Renda | 34861529 | 1 | 3.486153e+07 | 34861529.0 |
28222 | Itandehui Gutierrez | 120673227 | 1 | 1.206732e+08 | 120673227.0 |
73025 | Þrúður Kristjánsdóttir | 727594 | 1 | 7.275940e+05 | 727594.0 |
73026 rows × 5 columns
fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,1)
data2 = cast.sort_values(by='Count', ascending=False)[0:15]
plt.bar(data=data2, x='Cast', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Number movies", fontsize=20)
plt.title("Cast and Number of Movies they cast for", fontsize=20)
Text(0.5, 1.0, 'Cast and Number of Movies they cast for')
Our data contains the names of 73026 cast members
# Cast members credited in fewer than 3 movies, highest median gross first.
# sort_values() returns a new frame, so the filtered slice of `cast` is never
# modified in place.
cast3Movies = cast[cast['Count'] < 3].sort_values(by='Median', ascending=False)
cast3Movies
Cast | Total | Count | Mean | Median | |
---|---|---|---|---|---|
62173 | Sean Anthony Moran | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
30536 | Jason Whyte | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
62306 | Sean Patrick Murphy | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
29659 | James Patrick Pitt | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
38160 | Kelly Kilgour | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
... | ... | ... | ... | ... | ... |
25851 | Hammou Abaou | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
31262 | Jeff Prewett | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
28953 | Jacqueline Harris | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
46652 | Matthew R. Anderson | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
34893 | Jonas Ball | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
61310 rows × 5 columns
61310 cast members appear in fewer than 3 movies, and some of them have a very high Median value
This makes the rank unreliable when we try to develop a rank system on this type of field
fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,1)
data2 = cast.sort_values(by='Total', ascending=False)[0:20]
plt.bar(data=data2, x='Cast', height='Total', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Total Gross", fontsize=15)
plt.title("Cast and Total Gross of Movies they cast for", fontsize=15)
fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,2)
data3 = cast.sort_values(by='Mean', ascending=False)[0:20]
plt.bar(data=data3, x='Cast', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Average Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Average Gross", fontsize=15)
fig = plt.figure(figsize=(8, 6))
data3 = cast.sort_values(by='Median', ascending=False)[0:20]
plt.bar(data=data3, x='Cast', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Median Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Median Gross", fontsize=15)
plt.show()
Those cast members are not very well known. Jason Whyte and Sean Anthony Moran are cast members who appeared in only 1 movie
Now we will develop a rank system for cast (find the top leading cast members)
The number of movies they were cast in must be larger than 5
# Per-cast-member gross statistics, most frequently credited first.
cast = parseWithMoneyAndCount(data, 'Cast')
cast.sort_values(by='Count', ascending=False, inplace=True)
# Keep cast members credited in more than 5 movies, highest mean gross first.
# Each chained call returns a new frame, so the filtered slice of `cast` is
# never sorted or re-indexed in place.
cast10Movies = (
    cast[cast['Count'] > 5]
    .sort_values(by='Mean', ascending=False)
    .reset_index(drop=True)
)
cast10Movies
Cast | Total | Count | Mean | Median | |
---|---|---|---|---|---|
0 | Rupert Grint | 7786412658 | 8 | 9.733016e+08 | 9.383132e+08 |
1 | Lupita Nyong'o | 7278884954 | 8 | 9.098606e+08 | 1.020352e+09 |
2 | Evangeline Lilly | 6209206084 | 7 | 8.870294e+08 | 6.226741e+08 |
3 | Chadwick Boseman | 7668338102 | 9 | 8.520376e+08 | 1.506809e+08 |
4 | Karen Gillan | 8474182724 | 10 | 8.474183e+08 | 7.867049e+08 |
... | ... | ... | ... | ... | ... |
4569 | Deborah Kerr | 21481898 | 6 | 3.580316e+06 | 3.325950e+04 |
4570 | Rita Taggart | 19991204 | 6 | 3.331867e+06 | 2.243077e+06 |
4571 | Tony Curtis | 26234046 | 8 | 3.279256e+06 | 1.610252e+06 |
4572 | Dick Cavett | 30866179 | 10 | 3.086618e+06 | 1.324484e+06 |
4573 | Arsinée Khanjian | 15305635 | 6 | 2.550939e+06 | 3.003460e+06 |
4574 rows × 5 columns
plt.figure(figsize=(8, 6))
data3 = cast10Movies.sort_values(by='Mean', ascending=False)[0:25]
plt.bar(data=data3, x='Cast', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Average Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Average Gross", fontsize=15)
plt.show()
We call them leading cast members. Now we move on to developing the rank
# Order cast members from the lowest to the highest mean gross.
cast10Movies = cast10Movies.sort_values(by='Mean', ascending=True)
# Rank 1 is the cast member with the lowest mean gross.
castRank = {name: rank for rank, name in enumerate(cast10Movies['Cast'], start=1)}
castRank
{'Arsinée Khanjian': 1, 'Dick Cavett': 2, 'Tony Curtis': 3, 'Rita Taggart': 4, 'Deborah Kerr': 5, 'John Lennon': 6, 'Victoria Abril': 7, 'Jill Schoelen': 8, 'Alison Steadman': 9, 'Jacques Mathou': 10, 'Lior Ashkenazi': 11, 'Ramon Bieri': 12, 'Mink Stole': 13, 'Traci Lind': 14, 'Emilio Fernández': 15, 'Jim Metzler': 16, 'Kathryn Grody': 17, 'Cheryl Ladd': 18, 'Patrick Huard': 19, 'John Cullum': 20, 'Ricki Lake': 21, 'Jack Rader': 22, 'Blu Mankuma': 23, 'Peter Eyre': 24, 'Gore Vidal': 25, 'Don McKellar': 26, 'Suniel Shetty': 27, 'Matt Keeslar': 28, 'Anne-Marie Johnson': 29, 'Humphrey Bogart': 30, 'Luke Askew': 31, 'Marcello Mastroianni': 32, 'Richard Widmark': 33, 'John Savident': 34, 'Vrajesh Hirjee': 35, 'Millie Perkins': 36, 'Tim Barlow': 37, 'Anna Massey': 38, 'Jodie Markell': 39, 'Amy Locane': 40, 'Louise Latham': 41, 'Gordon Pinsent': 42, 'Jennifer Edwards': 43, 'Ingrid Bergman': 44, 'Debbie Harry': 45, 'Woody Strode': 46, 'Sandy Baron': 47, 'Louise Lasser': 48, 'Tom Burke': 49, 'John Considine': 50, 'Niall Buggy': 51, 'Al White': 52, 'Kerry Fox': 53, 'Albert Delpy': 54, 'Orson Welles': 55, 'George Dickerson': 56, 'Ken Wahl': 57, 'Nastassja Kinski': 58, 'John Sayles': 59, 'Sherilyn Fenn': 60, 'Perry Lang': 61, 'Raghuvir Yadav': 62, 'Eric Payne': 63, 'Rémy Girard': 64, 'Dhritiman Chatterjee': 65, 'Danton Stone': 66, 'Alan Bates': 67, 'John F. 
Kennedy': 68, 'Laura Morante': 69, 'Gabriele Ferzetti': 70, 'Isabelle Adjani': 71, 'Robert Townsend': 72, 'Teddy Wilson': 73, 'Alberta Watson': 74, 'Ratna Pathak Shah': 75, 'Steve James': 76, 'Tristram Jellinek': 77, 'Perry Lopez': 78, 'Ian Wolfe': 79, 'Kirk Douglas': 80, 'William Russ': 81, 'Brooke Adams': 82, 'Michael Hordern': 83, 'Kieu Chinh': 84, 'Tony Frank': 85, 'John Lynch': 86, 'Antonia Rey': 87, 'Camille Saviola': 88, 'Sonakshi Sinha': 89, 'Harvey Atkin': 90, 'Gilda Radner': 91, 'Wings Hauser': 92, 'Murli Sharma': 93, 'Prem Chopra': 94, 'Mukesh Tiwari': 95, 'Jamie Tirelli': 96, 'Barack Obama': 97, 'Anne Pitoniak': 98, 'Ticky Holgado': 99, 'Divya Dutta': 100, 'Lesley-Anne Down': 101, 'Vipin Sharma': 102, 'Molly Parker': 103, 'Lysette Anthony': 104, 'Puneet Issar': 105, 'Brijendra Kala': 106, 'Charles Bronson': 107, 'John P. Ryan': 108, 'Charles Cioffi': 109, 'David Dukes': 110, 'Leonard L. Thomas': 111, 'Leonard Termo': 112, 'Brady Corbet': 113, 'Wanda De Jesus': 114, 'David Dwyer': 115, 'Tom Hickey': 116, 'Lonny Chapman': 117, 'Kulbhushan Kharbanda': 118, 'Miou-Miou': 119, 'Neil Young': 120, 'Michael Gross': 121, 'Gene Kelly': 122, 'Don Francks': 123, 'Kirby Heyborne': 124, 'Steve Railsback': 125, 'Philip Akin': 126, 'Richard Brooks': 127, 'Manoj Pahwa': 128, 'Ángela Molina': 129, 'Melonie Diaz': 130, 'Arye Gross': 131, 'William McNamara': 132, 'Rani Mukerji': 133, 'Vincent Spano': 134, 'John Ventimiglia': 135, 'Tinnu Anand': 136, 'Manoj Bajpayee': 137, 'James Luisi': 138, 'Vidya Balan': 139, 'Lesley Ann Warren': 140, 'Peter Cook': 141, 'Farnesio de Bernal': 142, 'Ray Sharkey': 143, 'Ray McAnally': 144, 'Lawrence Tierney': 145, 'Troy Byer': 146, 'Ron Leibman': 147, 'Nigel Terry': 148, 'Richard Jaeckel': 149, 'Bill Paterson': 150, 'Daniel Baldwin': 151, 'Jeanine Jackson': 152, 'Mariel Hemingway': 153, 'Robert Trebor': 154, 'Victoria Jackson': 155, 'Billy Green Bush': 156, 'Meg Foster': 157, 'Billy Jayne': 158, 'Saif Ali Khan': 159, 'Christopher Fulford': 
160, 'Richard Bradford': 161, 'Rachel Ward': 162, 'Eleanor Bron': 163, 'Preity Zinta': 164, 'Sam Waterston': 165, 'Henry Silva': 166, 'Wayne Robson': 167, 'Matthew Faison': 168, 'Jean-Marc Barr': 169, 'David Hemblen': 170, 'Terry Kiser': 171, 'Timothy Jerome': 172, 'David Johansen': 173, 'Annie Golden': 174, 'Daniel Auteuil': 175, 'Sandra Bernhard': 176, 'Johny Lever': 177, 'Edie Falco': 178, 'Molly Hagan': 179, 'Tony Roberts': 180, 'Kunal Kapoor': 181, 'Liz Torres': 182, 'Bill Clinton': 183, 'Guy Boyd': 184, 'Susan Tyrrell': 185, "John O'Leary": 186, 'Rossy de Palma': 187, 'Jack Nance': 188, 'Ron White': 189, 'Massimo Sarchielli': 190, 'Michael Dudikoff': 191, 'Pat Corley': 192, 'Ernest Borgnine': 193, 'Chris Haywood': 194, 'Brent Hinkley': 195, 'Masood Akhtar': 196, 'Hrithik Roshan': 197, 'Cybill Shepherd': 198, 'Kirron Kher': 199, 'Ione Skye': 200, "Raymond O'Connor": 201, 'Alison Elliott': 202, 'E. Katherine Kerr': 203, 'Ashley Peldon': 204, 'Michael DeLorenzo': 205, 'Philippe Morier-Genoud': 206, 'Dennis Letts': 207, 'Herbert Lom': 208, 'Marilu Henner': 209, 'Vincent Kartheiser': 210, 'Robert Stephens': 211, 'Mackenzie Astin': 212, 'Rutanya Alda': 213, 'Tom Nolan': 214, 'Lara Dutta': 215, 'Kim Hunter': 216, 'Darcy DeMoss': 217, 'Charles Lane': 218, 'Panchito Gómez': 219, 'Polly Draper': 220, 'Rudy De Luca': 221, 'Beau Starr': 222, 'Darren McGavin': 223, 'Daniel Lapaine': 224, 'Stuart Margolin': 225, 'Santos Morales': 226, 'Shabana Azmi': 227, 'Chunky Panday': 228, 'Rajpal Naurang Yadav': 229, "De'aundre Bonds": 230, 'Susan Barnes': 231, 'Vic Polizos': 232, 'Tim Thomerson': 233, 'John Roselius': 234, 'Lillete Dubey': 235, 'Gary Grubbs': 236, 'Kane Hodder': 237, 'Carrie Snodgress': 238, 'Stephen Baldwin': 239, 'Joe Regalbuto': 240, 'David Harris': 241, 'Victor Argo': 242, 'H.B. 
Haggerty': 243, 'Jordan Baker': 244, 'Ajay Devgn': 245, 'Dan Shor': 246, 'Harley Cross': 247, 'Tupac Shakur': 248, 'Sid Haig': 249, 'John Dennis Johnston': 250, 'Joie Lee': 251, 'Carlin Glynn': 252, 'William Morgan Sheppard': 253, 'Ellen David': 254, "Olivia d'Abo": 255, 'Tuesday Weld': 256, 'Timothy Bottoms': 257, 'Meshach Taylor': 258, 'Eddie Albert': 259, 'Kristy McNichol': 260, 'Alex Colon': 261, 'Kareena Kapoor': 262, 'Ruth Sheen': 263, 'Isabelle Huppert': 264, 'Michael Schoeffling': 265, 'Tim Daly': 266, 'Sachin Khedekar': 267, 'Tim Woodward': 268, 'Mark Acheson': 269, 'Jonathan Silverman': 270, 'Neil Crone': 271, 'R.G. Armstrong': 272, 'Al Waxman': 273, 'Jack Wallace': 274, 'Barbara Sukowa': 275, 'Tico Wells': 276, 'Kirk Cameron': 277, 'Thierry Lhermitte': 278, 'Joanne Baron': 279, 'Jason Priestley': 280, 'John Abraham': 281, 'Beah Richards': 282, 'Jean-Claude Dreyfus': 283, 'John Schneider': 284, 'Tonya Pinkins': 285, 'Meg Tilly': 286, 'John Tormey': 287, 'Catherine Deneuve': 288, 'Katrin Cartlidge': 289, 'Daniel Giménez Cacho': 290, 'Matthew Laurance': 291, 'Louis Giambalvo': 292, 'Miles Chapin': 293, 'Craig Sheffer': 294, "Tom O'Brien": 295, 'Stoney Jackson': 296, 'Paul Shenar': 297, 'Lee Grant': 298, 'Sheeba Chaddha': 299, 'Louis Mustillo': 300, 'Julian Sands': 301, 'Anne-Marie Duff': 302, 'Erin Darke': 303, 'Akshay Kumar': 304, 'Jim Moody': 305, 'Anthony Holland': 306, 'Faye Grant': 307, 'Clayton Landey': 308, 'Peter Ustinov': 309, 'Nicholas Campbell': 310, 'Paul Bartel': 311, 'Alan Fudge': 312, 'Pankaj Tripathi': 313, 'Mithun Chakraborty': 314, 'Natalija Nogulich': 315, 'Richard Benjamin': 316, 'Bruce Payne': 317, 'Joe Seneca': 318, 'Theodore Bikel': 319, 'Richard Romanus': 320, 'Gerrit Graham': 321, 'Ken Campbell': 322, 'Nadim Sawalha': 323, 'William Newman': 324, 'Victoria Tennant': 325, 'Suresh Menon': 326, 'Iggy Pop': 327, 'John LaMotta': 328, 'Clu Gulager': 329, 'Kenneth McMillan': 330, 'Bernie McInerney': 331, 'Ted McGinley': 332, 'Ben Johnson': 
333, 'Mira Sorvino': 334, 'Chloë Sevigny': 335, 'Julie Delpy': 336, 'John Barrett': 337, 'Gerry Black': 338, 'James Cada': 339, 'Vlasta Vrana': 340, 'Harry Northup': 341, 'Keenan Wynn': 342, 'Mia Farrow': 343, 'Ellis Williams': 344, 'Dennis Dun': 345, 'Phil Fondacaro': 346, 'Sylvia Miles': 347, 'Alia Bhatt': 348, 'Mary Woronov': 349, 'Riteish Deshmukh': 350, 'Lawrence Dane': 351, 'Peter Arne': 352, 'Scott Coffey': 353, 'Michel Blanc': 354, 'David Bowie': 355, 'Jason London': 356, 'Joseph Ragno': 357, 'Corbin Bernsen': 358, 'Chris Spencer': 359, 'Joanne Whalley': 360, 'Vincent Perez': 361, 'Fabrice Luchini': 362, 'Armin Shimerman': 363, 'Lee de Broux': 364, 'James Stewart': 365, 'Angélica Aragón': 366, 'Shah Rukh Khan': 367, 'Govardhan Asrani': 368, 'Bijou Phillips': 369, 'Moses Gunn': 370, 'Corey Haim': 371, 'Annie McEnroe': 372, 'Cliff De Young': 373, 'Ken Pogue': 374, 'André Maranne': 375, 'Mohd. Zeeshan Ayyub': 376, 'Michael Cassidy': 377, 'Satish Shah': 378, 'Rachael Leigh Cook': 379, 'Bill Buell': 380, 'Rishi Kapoor': 381, 'Jessica Lundy': 382, 'Jeffrey Combs': 383, 'Sal Lopez': 384, 'Lewis Arquette': 385, 'Marceline Hugot': 386, 'Peter Vaughan': 387, 'Paul Butler': 388, 'Cliff Gorman': 389, 'Michelle Meyrink': 390, 'Jackie Shroff': 391, 'Cooper Huckabee': 392, 'Bipasha Basu': 393, 'Lorraine Bracco': 394, 'Keenen Ivory Wayans': 395, 'Julie Payne': 396, 'Donald Pleasence': 397, 'Fernando Rey': 398, 'Richard Belzer': 399, 'Kim Wayans': 400, 'Shelley Winters': 401, 'Michael Tucker': 402, 'David de Keyser': 403, 'Pawan Malhotra': 404, 'Jacqueline Bisset': 405, 'Royal Dano': 406, 'Luke Edwards': 407, 'José Ferrer': 408, 'Dov Tiefenbach': 409, 'Mary Stuart Masterson': 410, 'Bill Sage': 411, 'Molly Ringwald': 412, 'Justin Edwards': 413, 'Arshad Warsi': 414, 'Anthony Johnson': 415, 'Malaika Arora': 416, 'Frederic Forrest': 417, 'Toshirô Mifune': 418, 'Bert Remsen': 419, 'Arun Bali': 420, 'Laura Ramsey': 421, 'Sarah Polley': 422, 'Julius Harris': 423, 'Michael Greene': 
424, 'Charles McKeown': 425, 'Steven Weber': 426, 'Steve Antin': 427, 'Michael McKean': 428, 'Peter Sellers': 429, 'Ned Bellamy': 430, 'Peter Kwong': 431, 'Abhishek Bachchan': 432, 'John Vernon': 433, 'Chuck Cooper': 434, 'Earl Billings': 435, 'Fanny Ardant': 436, 'Gene Davis': 437, 'Michael Paré': 438, 'Thomas Hill': 439, 'Michael J. Reynolds': 440, 'Andre Gregory': 441, 'Daphne Zuniga': 442, 'Henry Fonda': 443, 'Darlanne Fluegel': 444, 'Trini Alvarado': 445, 'Christine Lahti': 446, 'Sally Kellerman': 447, 'Jodi Long': 448, 'Louis Guss': 449, 'Mickey Jones': 450, 'Christopher Malcolm': 451, 'Arjun Rampal': 452, 'Albert Salmi': 453, 'Ice-T': 454, 'Arlen Dean Snyder': 455, 'Pauline Collins': 456, "Milo O'Shea": 457, 'Tom Wright': 458, 'Danny Aiello': 459, 'Juhi Chawla': 460, 'George Burns': 461, 'Newell Alexander': 462, 'Phil Davis': 463, 'Sean McCann': 464, 'T.K. Carter': 465, 'Naomi Campbell': 466, 'Shawn Hatosy': 467, 'Piper Laurie': 468, 'Judd Nelson': 469, 'Jane Hallaren': 470, 'Steve Forrest': 471, 'Esai Morales': 472, 'Louise Fletcher': 473, 'Maria Conchita Alonso': 474, 'Mia Kirshner': 475, 'Sônia Braga': 476, 'Matthew Cowles': 477, 'Joe Unger': 478, 'Dagmara Dominczyk': 479, 'Allan Arbus': 480, 'RuPaul': 481, 'Lee Ving': 482, 'Sy Richardson': 483, 'Michael V. Gazzo': 484, 'Sammy Davis Jr.': 485, 'Maggie McCarthy': 486, 'Jenny Wright': 487, 'Ann Wedgeworth': 488, 'John Carradine': 489, 'Warren Clarke': 490, 'Carmine Caridi': 491, 'Jon Polito': 492, 'James N. Harrell': 493, 'Kathleen Wilhoite': 494, 'Manoj Joshi': 495, 'Bill Moseley': 496, 'Suzanne Shepherd': 497, 'George R. 
Robertson': 498, 'Geraldine Page': 499, 'Zakir Hussain': 500, 'Keith Carradine': 501, 'Robin Thomas': 502, 'Mark Webber': 503, 'Jack Elam': 504, 'Ari Graynor': 505, 'Severn Darden': 506, 'Janet MacLachlan': 507, 'Tony DiBenedetto': 508, 'Geneviève Bujold': 509, 'Ken Magee': 510, 'Joe Lisi': 511, 'Loyd Catlett': 512, 'Robert Carradine': 513, 'Michael Alldredge': 514, 'Jimmy Sheirgill': 515, 'Richard Masur': 516, 'Ian Bannen': 517, 'Laurence Olivier': 518, 'William Prince': 519, 'Richard B. Shull': 520, 'Karl Johnson': 521, 'Rita Moreno': 522, 'Anthony De Longis': 523, 'Sarah Trigger': 524, 'Freddie Jones': 525, 'Sanjay Mishra': 526, 'David Proval': 527, 'James Staley': 528, 'Tom Villard': 529, 'Donald Hotton': 530, 'Jackie Burroughs': 531, 'Amy Wright': 532, 'Richard Mulligan': 533, 'Diana Bellamy': 534, 'Mark Duplass': 535, 'Don Pugsley': 536, 'Fisher Stevens': 537, 'Ranbir Kapoor': 538, 'Levon Helm': 539, 'Adrian Dunbar': 540, 'James Wilby': 541, 'Jan Tríska': 542, 'Rae Dawn Chong': 543, 'Anne De Salvo': 544, 'Tony Lo Bianco': 545, 'Patricia Arquette': 546, 'Jonathan Brandis': 547, 'Alia Shawkat': 548, 'Anna Chancellor': 549, 'Nicholas Rowe': 550, 'Manny Perez': 551, 'Stacy Edwards': 552, 'Dennis Lipscomb': 553, 'Vic Tayback': 554, 'Sam McMurray': 555, 'Michael Laskin': 556, 'Priscilla Pointer': 557, 'Jan Rubes': 558, 'Lee Wilkof': 559, 'James Keach': 560, 'Ken Foree': 561, 'Spike Lee': 562, 'John Dunn-Hill': 563, 'Ben Gazzara': 564, 'Kent Broadhurst': 565, "Dick O'Neill": 566, 'James Urbaniak': 567, 'Boman Irani': 568, 'Michael Harding': 569, 'Jennifer Jason Leigh': 570, 'Roger Aaron Brown': 571, 'Jaaved Jaaferi': 572, 'Jane Kaczmarek': 573, 'Sam Wanamaker': 574, 'Kevin Conway': 575, 'Raleigh Bond': 576, 'Maribel Verdú': 577, 'Blue Deckert': 578, 'Georgann Johnson': 579, 'Richard Farnsworth': 580, 'Lisa Jane Persky': 581, 'Billie Whitelaw': 582, 'Lindsay Crouse': 583, 'Kajol': 584, 'Sonam Kapoor': 585, 'Nicole Beharie': 586, 'Bruce M. 
Fischer': 587, 'James Mason': 588, 'Aishwarya Rai Bachchan': 589, 'Pam Grier': 590, 'Richard Lynch': 591, 'Sonny Carl Davis': 592, 'Marsha Mason': 593, 'Peter Horton': 594, 'Steve McQueen': 595, 'Kabir Bedi': 596, 'Sully Boyar': 597, 'Audra Lindley': 598, 'Shelley Duvall': 599, 'Dick Van Patten': 600, 'Billy Beck': 601, 'Sanjay Dutt': 602, 'Dana Delany': 603, 'Kelly Jo Minter': 604, 'David Hart': 605, 'Jeremy Northam': 606, 'Don Hood': 607, 'Keith Coogan': 608, 'Ronald Guttman': 609, 'Stanley Brock': 610, 'Hart Bochner': 611, 'William Smith': 612, 'Peter Elliott': 613, 'Mike Moroff': 614, 'John Ritter': 615, 'Lolita Davidovich': 616, 'Ben Shenkman': 617, 'Bob Minor': 618, 'Todd Graff': 619, 'Bo Hopkins': 620, 'Mamie Gummer': 621, 'Maria Dizzia': 622, 'Ranjit Chowdhry': 623, 'Phoebe Cates': 624, 'Paul Benjamin': 625, 'Pauly Shore': 626, 'Del Close': 627, 'Miriam Colon': 628, 'Ally Sheedy': 629, 'Joanna Pacula': 630, 'Julie Warner': 631, 'Allan Corduner': 632, 'William Ragsdale': 633, 'Tom Heaton': 634, 'Norman Fell': 635, 'Mike Pniewski': 636, 'Vincent Price': 637, 'Renée Taylor': 638, 'Ed Grady': 639, 'Ian Gomez': 640, 'Nicollette Sheridan': 641, 'William Traylor': 642, 'Dana Wheeler-Nicholson': 643, 'Brian McNamara': 644, 'Method Man': 645, 'John Sessions': 646, 'Graham Stark': 647, 'Paul Le Mat': 648, 'Kenneth Tigar': 649, 'Woody Allen': 650, 'Marvin J. McIntyre': 651, 'Jacqueline Fernandez': 652, 'Romola Garai': 653, "Sean 'Diddy' Combs": 654, 'Eloy Casados': 655, 'Helen Hanft': 656, 'John Saxon': 657, 'Howard Hesseman': 658, 'Dirk Blocker': 659, 'Shelley Long': 660, 'David Bradley': 661, 'Brooke Shields': 662, 'Taylor Negron': 663, 'Meagen Fay': 664, 'Judith Ivey': 665, 'Alexandra Holden': 666, 'Phyllis Somerville': 667, 'Nathan Davis': 668, 'John Houseman': 669, 'George Plimpton': 670, 'Michael J. 
Pagan': 671, 'Marco Rodríguez': 672, 'Robert Vaughn': 673, 'Damon Wayans': 674, 'Karen Young': 675, 'Farrah Fawcett': 676, 'Billy Barty': 677, 'Gillian Jacobs': 678, 'Mark Bringelson': 679, 'Haviland Morris': 680, 'Hill Harper': 681, 'William Petersen': 682, "Annette O'Toole": 683, 'Clifford A. Pellow': 684, 'Irène Jacob': 685, 'Eric Schweig': 686, 'Mark Tandy': 687, 'Emilio Estevez': 688, 'Wilbur Fitzgerald': 689, 'Brenda Vaccaro': 690, 'Nelsan Ellis': 691, 'Ron Silver': 692, 'Edith Fields': 693, 'Malinda Williams': 694, 'Talia Balsam': 695, 'George Coe': 696, 'Gene Hartline': 697, 'J.W. Smith': 698, 'Peter MacNeill': 699, 'Sarita Choudhury': 700, 'Rupert Graves': 701, 'Carol Sutton': 702, 'Kim Greist': 703, 'Philip Bruns': 704, 'Joyce Brothers': 705, 'Nicholas Farrell': 706, 'Michael J. Pollard': 707, "Dan O'Herlihy": 708, 'Jon Foster': 709, 'Lee Richardson': 710, 'Katrina Kaif': 711, 'Willard E. Pugh': 712, 'Rupert Frazer': 713, 'Robert Webber': 714, 'Annabella Sciorra': 715, 'Zulay Henao': 716, 'John Doe': 717, 'Gailard Sartain': 718, 'Welker White': 719, 'Pepe Serna': 720, 'James Pickens Jr.': 721, 'Rosanna Arquette': 722, 'Spalding Gray': 723, 'Catherine McCormack': 724, 'Graham Jarvis': 725, 'Cynthia Stevenson': 726, 'Lauren Hutton': 727, 'Salman Khan': 728, 'Paula Garcés': 729, 'Eddie Cibrian': 730, 'Helen Lloyd Breed': 731, 'Jeffrey Nordling': 732, 'Denise Crosby': 733, 'Cindy Williams': 734, 'Kim Delaney': 735, 'James Whitmore': 736, 'Daniel Benzali': 737, 'Kathryn Erbe': 738, 'Burke Byrnes': 739, 'Nancy Marchand': 740, 'Paul Sorvino': 741, 'D.W. Moffett': 742, 'Bill Bellamy': 743, 'Ray Walston': 744, 'Amy Madigan': 745, 'Carl Lumbly': 746, 'L.Q. Jones': 747, 'Amy Irving': 748, 'Terence Kelly': 749, 'Al Fann': 750, 'Lane Smith': 751, 'Bono': 752, 'David Byrd': 753, 'Kai Wulff': 754, 'Sushant Singh Rajput': 755, 'Om Puri': 756, 'Mary J. 
Blige': 757, 'Beau Bridges': 758, 'Richard Bright': 759, 'Kevin Heffernan': 760, 'Sean Young': 761, 'Lyman Ward': 762, 'Klaus Kinski': 763, 'Dwier Brown': 764, 'Kevin McCarthy': 765, 'Mario Van Peebles': 766, 'David L. Lander': 767, 'Steven Randazzo': 768, 'Joseph Maher': 769, 'Hanns Zischler': 770, 'Ebbe Roe Smith': 771, 'Dey Young': 772, 'Cindy Pickett': 773, 'Faye Dunaway': 774, 'Frank Adonis': 775, 'Annabeth Gish': 776, 'Joan Chen': 777, 'Charles Levin': 778, 'Leslie Caron': 779, 'Roberto Sosa': 780, 'Brett Rice': 781, 'George W. Bush': 782, 'Louis Gossett Jr.': 783, 'James Eckhouse': 784, 'Warren Oates': 785, 'Carol Kane': 786, 'Arthur J. Nascarella': 787, 'Chelsea Field': 788, 'Tim Ware': 789, 'James Biberi': 790, 'Jérémie Renier': 791, 'Karina Arroyave': 792, 'Nancy Allen': 793, 'Tom Savini': 794, 'Dudley Moore': 795, 'Joe Santos': 796, "Michael O'Keefe": 797, 'Lauren London': 798, 'Jack Thibeau': 799, 'Andrew Robinson': 800, 'John Standing': 801, 'Obba Babatundé': 802, 'Gregory Walcott': 803, 'Todd Allen': 804, 'Gregory Hines': 805, 'Andrew McCarthy': 806, 'John Gallagher Jr.': 807, 'Peter Bromilow': 808, 'Darrell Larson': 809, 'Judy Davis': 810, 'George C. Scott': 811, 'Doris Roberts': 812, 'Sheri Moon Zombie': 813, 'James Gammon': 814, 'Schuyler Fisk': 815, 'Eddie Velez': 816, 'Natasha Gregson Wagner': 817, 'Robert Miano': 818, 'Isaac Hayes': 819, 'Anupam Kher': 820, 'Jason Beghe': 821, 'Kadeem Hardison': 822, 'Roger Cross': 823, 'Naushaad Abbas': 824, 'Judy Parfitt': 825, 'Connie Britton': 826, 'Rockets Redglare': 827, 'Janet Carroll': 828, 'Lisa Blount': 829, 'Max Wright': 830, "Patti D'Arbanville": 831, 'Chuck Shamata': 832, 'Larry Flash Jenkins': 833, 'Chuck McCann': 834, 'Kaki Hunter': 835, 'Alan North': 836, 'F. William Parker': 837, 'Stephen E. Miller': 838, 'Margaret Whitton': 839, 'Ranveer Singh': 840, 'Richard C. 
Sarafian': 841, 'Paresh Rawal': 842, 'Sheila Kelley': 843, 'John de Lancie': 844, 'Woodrow Parfrey': 845, 'Brian Keith': 846, 'Eric Bogosian': 847, 'David Hayman': 848, 'Diane Ladd': 849, 'Melanie Griffith': 850, 'Clifton James': 851, 'Helen Martin': 852, 'Fenella Woolgar': 853, 'Jerry Levine': 854, 'Louis Herthum': 855, 'Ludivine Sagnier': 856, 'David Selby': 857, 'Robert Swan': 858, 'Mel Winkler': 859, 'Candy Clark': 860, 'Gretchen Mol': 861, 'J. Kenneth Campbell': 862, 'John Hancock': 863, 'Neil Ross': 864, 'Jacqueline Brookes': 865, 'Danny Webb': 866, 'Sam Bottoms': 867, 'Penelope Allen': 868, 'Ken Lerner': 869, 'John Philbin': 870, 'J. Smith-Cameron': 871, 'John Sharian': 872, 'Essence Atkins': 873, 'Alex Kendrick': 874, 'Christopher Curry': 875, 'Dennis Burkley': 876, 'Christopher Abbott': 877, 'Naseeruddin Shah': 878, 'Frank Baker': 879, 'Juliet Stevenson': 880, 'George Wendt': 881, 'Brandon Smith': 882, 'Rony Clanton': 883, 'David Wohl': 884, 'Mike White': 885, 'Greta Scacchi': 886, 'Daniel Hugh Kelly': 887, 'Jerry Hardin': 888, 'Devon Sawa': 889, 'Ira Wheeler': 890, "Ryan O'Neal": 891, 'Fred Astaire': 892, 'Paula Jai Parker': 893, "Mo'Nique": 894, 'Willie Nelson': 895, 'Peter Strauss': 896, 'Bess Armstrong': 897, 'Lou Diamond Phillips': 898, 'Roseanne Barr': 899, 'Geoffrey Lewis': 900, 'Marc Lawrence': 901, 'André Dussollier': 902, 'Daniel Gerroll': 903, 'Angel David': 904, 'Edward Fox': 905, 'Tia Texada': 906, 'Bibi Besch': 907, 'Rick Overton': 908, 'Michael McGrady': 909, 'Lynn Redgrave': 910, 'Dyan Cannon': 911, 'William Holden': 912, 'Aidan Quinn': 913, 'Ellen Barkin': 914, 'John Terry': 915, 'Badja Djola': 916, 'Nicholas Guest': 917, 'Leslie Easterbrook': 918, 'Yaphet Kotto': 919, 'Herta Ware': 920, 'Maddie Corman': 921, 'Ajay Naidu': 922, 'Tamala Jones': 923, 'Catherine Hicks': 924, 'Adam LeFevre': 925, 'Art Hindle': 926, 'Bridget Fonda': 927, 'Eileen Ryan': 928, 'Ed Begley Jr.': 929, 'Nina Foch': 930, 'Raphael Sbarge': 931, 'Charles Cyphers': 932, 
'Martin Savage': 933, 'George Wallace': 934, 'François Chau': 935, 'Franklyn Ajaye': 936, 'Ernie Lively': 937, 'Tito Larriva': 938, "Terry O'Quinn": 939, 'Diana Scarwid': 940, 'Michael Currie': 941, 'Burt Kwouk': 942, 'Geoffrey Arend': 943, 'Lance Kinsey': 944, 'Melissa Sagemiller': 945, 'Ellen Greene': 946, 'Phyllida Law': 947, 'Lisa Eichhorn': 948, 'Burt Lancaster': 949, 'Tom Bower': 950, 'Amitabh Bachchan': 951, 'Lilyan Chauvin': 952, 'John Gielgud': 953, 'Ed Lauter': 954, 'Kari Wuhrer': 955, 'J.C. Quinn': 956, 'Thomas Jefferson Byrd': 957, 'Duane Martin': 958, 'Scott Paulin': 959, 'Brad Sullivan': 960, 'Scott Wilson': 961, 'Patrick Cranshaw': 962, 'Martha Higareda': 963, 'James Hampton': 964, 'Becky Ann Baker': 965, 'Claire Bloom': 966, 'Ray Wise': 967, 'Billy Drago': 968, 'Ruth McCabe': 969, 'Jack Warden': 970, 'Susanna Thompson': 971, 'Wilfrid Hyde-White': 972, 'Scott Burkholder': 973, 'Rufus': 974, 'Scatman Crothers': 975, 'Brian Hooks': 976, 'Chris Sarandon': 977, 'Barry Primus': 978, 'Robert Joy': 979, 'Sharat Saxena': 980, 'Randall Batinkoff': 981, 'Efren Ramirez': 982, 'Paul Reubens': 983, 'Charles Haid': 984, 'Graham Beckel': 985, 'George Kennedy': 986, 'Duncan Fraser': 987, 'Matt Clark': 988, "Beverly D'Angelo": 989, 'Bud Cort': 990, 'Macy Gray': 991, 'Gwen McGee': 992, 'Martha Gehman': 993, 'Roxanne Hart': 994, 'Erich Anderson': 995, 'Felicity Huffman': 996, 'Brendan Sexton III': 997, 'Natasha Richardson': 998, 'Wayne Grace': 999, 'Tom Towles': 1000, ...}
Apply the ranking to the data to get CastsRank. Here we sum the rank values of each movie's cast members.
def getCastsTeamRank(casts, ranks=None, rng=None):
    """Return the summed rank of all cast members of one movie.

    Cast members present in the rank table contribute their rank. Members
    that are missing from the table contribute a random integer in
    [1, 200] instead.

    Args:
        casts: iterable of cast member names.
        ranks: optional mapping of name -> rank. Defaults to the
            module-level ``castRank`` table.
        rng: optional object exposing ``randint(a, b)`` (for example
            ``random.Random(seed)``), used for the unknown-member fallback.
            Defaults to the global ``random`` module. Pass a seeded
            generator to make the feature reproducible.

    Returns:
        The total rank of the cast.
    """
    if ranks is None:
        ranks = castRank
    if rng is None:
        rng = random

    total = 0
    for cast in casts:
        if cast in ranks:
            total += ranks[cast]
        else:
            # Unknown actor: fall back to a random low rank.
            total += rng.randint(1, 200)
    return total
# Build a working copy of `data` that carries the per-movie CastsRank column.
data2 = data.assign(CastsRank=data['Cast'].apply(getCastsTeamRank))
# Frequency of each distinct CastsRank value.
data2['CastsRank'].value_counts()
2055 6 0 5 17485 4 1806 4 1953 4 .. 27199 1 4658 1 27183 1 33941 1 31443 1 Name: CastsRank, Length: 7779, dtype: int64
# Pearson correlation coefficient (element 0 of pearsonr's result) between the
# CastsRank feature and worldwide gross, then a scatter plot of the two columns.
corr = pearsonr(data2['CastsRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CastsRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CastsRank', y='Gross_worldwide', color='brown')
Pearsons correlation between CastsRank and Gross: 0.526
<AxesSubplot:xlabel='CastsRank', ylabel='Gross_worldwide'>
# Store the CastsRank column currently held by data2 in `final`
# (`final` is defined outside this section).
final['CastsRank'] = data2['CastsRank']
What if we take the mean of the ranks instead?
def getCastsTeamRank(casts):
    """Return the mean rank of one movie's cast.

    Members missing from ``castRank`` add nothing to the numerator but are
    still counted in the denominator. An empty cast yields 0.0.
    """
    # Treat an empty cast as size 1 so the division below never fails.
    member_count = len(casts) or 1
    known_total = sum(castRank[member] for member in casts if member in castRank)
    return known_total / member_count
# Rebuild the working copy with CastsRank recomputed by the current
# getCastsTeamRank, then repeat the correlation check against worldwide gross.
data2 = data.assign(CastsRank=data['Cast'].apply(getCastsTeamRank))
data2['CastsRank'].value_counts()
corr = pearsonr(data2['CastsRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CastsRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CastsRank', y='Gross_worldwide', color='brown')
Pearsons correlation between CastsRank and Gross: 0.513
<AxesSubplot:xlabel='CastsRank', ylabel='Gross_worldwide'>
Both approaches give nearly the same correlation (about 0.51–0.53).
Now we will extract more features, starting with:
NumLeadActors
# Order cast10Movies in place by its 'Mean' column, highest first, and keep
# the names from the first 100 rows.
cast10Movies.sort_values('Mean', ascending=False, inplace=True)
top100Cast = cast10Movies['Cast'].iloc[:100].tolist()
def getNumLeadActors(casts):
    """Count how many members of ``casts`` appear in ``top100Cast``."""
    return sum(1 for member in casts if member in top100Cast)
# Working copy of `data` with the number of top-100 actors per movie.
data2 = data.assign(NumLeadActors=data['Cast'].apply(getNumLeadActors))
# Distribution of the NumLeadActors values.
data2['NumLeadActors'].value_counts()
0 7988 1 574 2 103 3 34 4 22 5 15 7 5 9 4 8 3 6 2 11 2 Name: NumLeadActors, dtype: int64
# Pearson correlation coefficient (element 0 of pearsonr's result) between
# NumLeadActors and worldwide gross, then a scatter plot of the two columns.
corr = pearsonr(data2['NumLeadActors'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumLeadActors and Gross: %.3f' % corr)
data2.plot.scatter(x='NumLeadActors', y='Gross_worldwide', color='brown')
Pearsons correlation between NumLeadActors and Gross: 0.593
<AxesSubplot:xlabel='NumLeadActors', ylabel='Gross_worldwide'>
# Store the NumLeadActors column in `final` (`final` is defined outside this section).
final['NumLeadActors'] = data2['NumLeadActors']
HasTop50Actors
# Order cast10Movies in place by its 'Mean' column, highest first, and keep
# the names from the first 50 rows.
cast10Movies.sort_values('Mean', ascending=False, inplace=True)
top50Cast = list(cast10Movies['Cast'].head(50))
def getHasTop30Actors(casts):
    """Return 1 if any member of ``casts`` is in ``top50Cast``, otherwise 0.

    NOTE(review): despite "Top30" in the name, the lookup list is
    ``top50Cast``; the name is kept so existing callers keep working.
    """
    return int(any(member in top50Cast for member in casts))
# Working copy of `data` with a 0/1 flag for movies featuring a top-50 actor,
# followed by its correlation with worldwide gross and a scatter plot.
data2 = data.assign(HasTop50Actors=data['Cast'].apply(getHasTop30Actors))
corr = pearsonr(data2['HasTop50Actors'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between HasTop50Actors and Gross: %.3f' % corr)
data2.plot.scatter(x='HasTop50Actors', y='Gross_worldwide', color='brown')
Pearsons correlation between HasTop50Actors and Gross: 0.364
<AxesSubplot:xlabel='HasTop50Actors', ylabel='Gross_worldwide'>
# Store the HasTop50Actors flag in `final` (`final` is defined outside this section).
final['HasTop50Actors'] = data2['HasTop50Actors']
Hypothesis: Does revenue depend on the number of crew members listed on the movie's IMDb page?
# Working copy of `data` with NumCrews = length of each movie's Crew entry,
# followed by its correlation with worldwide gross and a scatter plot.
data2 = data.assign(NumCrews=data['Crew'].apply(len))
corr = pearsonr(data2['NumCrews'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumCrews and Gross: %.3f' % corr)
data2.plot.scatter(x='NumCrews', y='Gross_worldwide', color='brown')
Pearsons correlation between NumCrews and Gross: 0.190
<AxesSubplot:xlabel='NumCrews', ylabel='Gross_worldwide'>
# Store the NumCrews column in `final` (`final` is defined outside this section).
final['NumCrews'] = data2['NumCrews']
# Build the per-crew-member table with the notebook's parseWithMoneyAndCount
# helper (defined elsewhere). The output below shows the columns
# Crew, Total, Count, Mean and Median.
crew = parseWithMoneyAndCount(data, 'Crew')
# Display the table ordered by Count, largest first (the sorted copy is not stored).
crew.sort_values(by='Count', ascending=False)
Crew | Total | Count | Mean | Median | |
---|---|---|---|---|---|
10649 | Woody Allen | 1192064607 | 43 | 2.772243e+07 | 14792779.0 |
1879 | Clint Eastwood | 3458362840 | 38 | 9.100955e+07 | 53572298.0 |
9561 | Stephen King | 1891234174 | 36 | 5.253428e+07 | 22759009.5 |
9715 | Steven Spielberg | 10743871515 | 34 | 3.159962e+08 | 288074136.5 |
5181 | John Hughes | 2930471621 | 33 | 8.880217e+07 | 49944325.0 |
... | ... | ... | ... | ... | ... |
4407 | James Leo Herlihy | 44801177 | 1 | 4.480118e+07 | 44801177.0 |
4406 | James Lee Burke | 5009305 | 1 | 5.009305e+06 | 5009305.0 |
4405 | James Lee Barrett | 126737428 | 1 | 1.267374e+08 | 126737428.0 |
4404 | James Lasdun | 2048740 | 1 | 2.048740e+06 | 2048740.0 |
10777 | Óskar Jónasson | 96262212 | 1 | 9.626221e+07 | 96262212.0 |
10778 rows × 5 columns
Number of movies each crew member participated in
# Bar chart of Count for the 20 crew rows with the largest Count.
crew.sort_values(by='Count', ascending=False)[0:20].plot.bar(x='Crew', y='Count', color='green')
<AxesSubplot:xlabel='Crew'>
# Display the 20 crew rows with the largest Mean (not stored).
crew.sort_values(by='Mean', ascending=False)[0:20]
Crew | Total | Count | Mean | Median | |
---|---|---|---|---|---|
639 | Anthony Russo | 6844248984 | 5 | 1.368850e+09 | 1.153337e+09 |
5007 | Joe Russo | 6844248984 | 5 | 1.368850e+09 | 1.153337e+09 |
5004 | Joe Robert Cole | 1347597973 | 1 | 1.347598e+09 | 1.347598e+09 |
9486 | Stan Lee | 13024534758 | 12 | 1.085378e+09 | 9.784765e+08 |
5526 | Josh Cooley | 1073394593 | 1 | 1.073395e+09 | 1.073395e+09 |
5198 | John Knoll | 1056057720 | 1 | 1.056058e+09 | 1.056058e+09 |
3046 | Eric Guillon | 1034800131 | 1 | 1.034800e+09 | 1.034800e+09 |
557 | Angus MacLane | 1028570942 | 1 | 1.028571e+09 | 1.028571e+09 |
10351 | Victoria Strouse | 1028570942 | 1 | 1.028571e+09 | 1.028571e+09 |
5389 | Jon Watts | 2012094920 | 2 | 1.006047e+09 | 1.006047e+09 |
4760 | Jennifer Lee | 2864210897 | 3 | 9.547370e+08 | 1.281508e+09 |
8205 | Pierre Coffin | 3708124783 | 4 | 9.270312e+08 | 1.002783e+09 |
4229 | J.K. Rowling | 9255312560 | 10 | 9.255313e+08 | 9.155662e+08 |
6324 | Lori Forte | 877244782 | 1 | 8.772448e+08 | 8.772448e+08 |
10688 | Yi Liu | 870325439 | 1 | 8.703254e+08 | 8.703254e+08 |
8264 | Qun Dong | 870325439 | 1 | 8.703254e+08 | 8.703254e+08 |
4935 | Jing Wu | 870325439 | 1 | 8.703254e+08 | 8.703254e+08 |
4475 | Jan Duursema | 868390560 | 1 | 8.683906e+08 | 8.683906e+08 |
9571 | Stephen McFeely | 8655378624 | 10 | 8.655379e+08 | 5.322244e+08 |
526 | Andy Lanning | 863756051 | 1 | 8.637561e+08 | 8.637561e+08 |
There are lots of writers and directors who participated in only a few movies
# Keep only the rows whose Count exceeds 4 (i.e. a Count of 5 or more),
# dropping crew members with few credits.
releases4crew = crew[crew['Count'] > 4]
# Display the 20 remaining rows with the largest Mean (not stored).
releases4crew.sort_values(by='Mean', ascending=False)[0:20]
Crew | Total | Count | Mean | Median | |
---|---|---|---|---|---|
639 | Anthony Russo | 6844248984 | 5 | 1.368850e+09 | 1.153337e+09 |
5007 | Joe Russo | 6844248984 | 5 | 1.368850e+09 | 1.153337e+09 |
9486 | Stan Lee | 13024534758 | 12 | 1.085378e+09 | 9.784765e+08 |
4229 | J.K. Rowling | 9255312560 | 10 | 9.255313e+08 | 9.155662e+08 |
9571 | Stephen McFeely | 8655378624 | 10 | 8.655379e+08 | 5.322244e+08 |
2494 | David Yates | 6021591899 | 7 | 8.602274e+08 | 9.344541e+08 |
1779 | Christopher Markus | 9400391739 | 11 | 8.545811e+08 | 6.447831e+08 |
6172 | Lee Unkrich | 3892224463 | 5 | 7.784449e+08 | 8.078179e+08 |
8173 | Philippa Boyens | 6663807561 | 9 | 7.404231e+08 | 9.474951e+08 |
489 | Andrew Stanton | 5672435426 | 8 | 7.090544e+08 | 7.308323e+08 |
6992 | Michael Arndt | 3359222246 | 5 | 6.718444e+08 | 2.861686e+08 |
1237 | Brian Lynch | 3203896208 | 5 | 6.407792e+08 | 5.549875e+08 |
9637 | Steve Kloves | 7669781515 | 12 | 6.391485e+08 | 8.384820e+08 |
1281 | Bruce Geller | 3173171913 | 5 | 6.346344e+08 | 6.827166e+08 |
1691 | Chris Renaud | 3168230230 | 5 | 6.336460e+08 | 5.431140e+08 |
4687 | Jeff Nathanson | 4425072143 | 7 | 6.321532e+08 | 3.521143e+08 |
8784 | Robert Wade | 4923637935 | 8 | 6.154547e+08 | 6.030417e+08 |
1680 | Chris McKenna | 3059989349 | 5 | 6.119979e+08 | 6.226741e+08 |
3256 | Fran Walsh | 6696215912 | 11 | 6.087469e+08 | 8.976901e+08 |
2593 | Derek Connolly | 3020187001 | 5 | 6.040374e+08 | 5.666528e+08 |
# Side-by-side bar charts of the top-20 crew rows, ranked by Mean (left)
# and by Median (right).
fig = plt.figure(figsize=(12, 5))

# Left panel: top 20 by Mean.
plt.subplot(121)
plt.bar(x='Crew', height='Mean',
        data=crew.sort_values('Mean', ascending=False).iloc[:20])
plt.xticks(rotation=90)
plt.title("Mean Plot")

# Right panel: top 20 by Median.
plt.subplot(122)
plt.bar(x='Crew', height='Median',
        data=crew.sort_values('Median', ascending=False).iloc[:20])
plt.xticks(rotation=90)
plt.title("Median Plot")

plt.show()
The top of the ranking does not seem to change very much
Now we will develop a rank for the crew
# Rank crew members by ascending 'Mean': rank 1 is the lowest Mean and the
# largest rank is the highest Mean.
releases4crew = releases4crew.sort_values(by='Mean').reset_index(drop=True)
crewRank = {
    name: rank
    for rank, name in enumerate(releases4crew['Crew'], start=1)
}
crewRank
{'Federico Fellini': 1, 'André Téchiné': 2, 'Ingmar Bergman': 3, 'Todd Solondz': 4, 'Errol Morris': 5, 'Warren Miller': 6, 'Bill Forsyth': 7, 'Krzysztof Kieslowski': 8, 'Krzysztof Piesiewicz': 9, 'Sally Potter': 10, 'Louis Malle': 11, 'Mark Monroe': 12, 'Henry James': 13, 'Alan Rudolph': 14, 'Zalman King': 15, 'Hanif Kureishi': 16, 'Atom Egoyan': 17, 'Naomi Foner': 18, 'François Truffaut': 19, 'Bille August': 20, 'Werner Herzog': 21, 'Desmond Nakano': 22, 'Alfred Hitchcock': 23, 'Agnieszka Holland': 24, 'Sam Peckinpah': 25, 'Olivier Assayas': 26, 'Harold Pinter': 27, 'Patrice Leconte': 28, 'Laura Jones': 29, 'Allan Scott': 30, 'Billy Wilder': 31, 'Don Coscarelli': 32, 'Whit Stillman': 33, 'Sergio Donati': 34, 'David Zelag Goodman': 35, 'James R. Silke': 36, 'Robin Bhatt': 37, 'Frank Cottrell Boyce': 38, 'Menahem Golan': 39, 'Michael Winterbottom': 40, 'Neal Jimenez': 41, 'Farhan Akhtar': 42, 'Denys Arcand': 43, 'Uwe Boll': 44, 'Eric Red': 45, 'J. Lee Thompson': 46, 'John Huston': 47, 'Stanley Donen': 48, 'Albert Brooks': 49, 'Sam Firstenberg': 50, 'Terry Jones': 51, 'Bob Rafelson': 52, 'Monica Mcgowan Johnson': 53, 'Aaron Norris': 54, 'Anvita Dutt': 55, 'Oren Moverman': 56, 'Gérard Brach': 57, 'Ken Hixon': 58, 'Darin Scott': 59, 'Ruth Prawer Jhabvala': 60, 'Christopher Guest': 61, 'Susan Seidelman': 62, 'Michael Thomas': 63, 'Ken Russell': 64, 'Christopher Crowe': 65, 'E. Max Frye': 66, 'John Irvin': 67, 'Mike Leigh': 68, 'David Lynch': 69, 'Richard Fleischer': 70, 'Paul Schrader': 71, 'Jérôme Tonnerre': 72, 'Jim Jarmusch': 73, 'Emilio Estevez': 74, 'Sergio Leone': 75, 'Albert Pyun': 76, 'Imtiaz Ali': 77, 'Nicole Holofcener': 78, 'Anurag Kashyap': 79, 'Michael Winner': 80, 'Eugene Levy': 81, 'Bill Norton': 82, 'James Ivory': 83, 'Randall Miller': 84, 'Mira Nair': 85, 'Michael Dowse': 86, 'Richard Pearce': 87, 'Robert Mandel': 88, 'John Dahl': 89, 'Priyadarshan': 90, 'Charlie Peters': 91, 'Jane Campion': 92, 'Roland Joffé': 93, 'Tom Ropelewski': 94, 'Sidney J. 
Furie': 95, 'Michael Radford': 96, 'Todd Haynes': 97, 'Carroll Ballard': 98, 'Gillian Armstrong': 99, 'Ulu Grosbard': 100, 'Fred Schepisi': 101, 'Stewart Raffill': 102, 'John Curran': 103, 'Davis Guggenheim': 104, 'John Sayles': 105, 'Tony Bill': 106, 'David Shaber': 107, 'Alan Ormsby': 108, 'Todd Graff': 109, 'David Mickey Evans': 110, 'Luciano Vincenzoni': 111, 'Ernest R. Dickerson': 112, 'Thom Eberhardt': 113, 'Clive Barker': 114, 'Peter Yates': 115, 'John Briley': 116, 'Jacques Audiard': 117, 'Bernardo Bertolucci': 118, 'William Shakespeare': 119, 'Brian Garfield': 120, 'Alan Bennett': 121, 'Christopher Cain': 122, 'Michael Schultz': 123, 'Ben Hecht': 124, 'John Boorman': 125, 'Rudy De Luca': 126, 'Martha Coolidge': 127, 'Douglas McGrath': 128, 'Franco Zeffirelli': 129, 'Lars von Trier': 130, 'Phil Joanou': 131, 'James Toback': 132, 'Bruce Robinson': 133, 'Sanjay Leela Bhansali': 134, 'Jeff Kanew': 135, 'E.M. Forster': 136, 'Sidney Lumet': 137, 'Rick Rosenthal': 138, 'Paul Mazursky': 139, 'Robert Townsend': 140, 'Michael Haneke': 141, 'Mike Binder': 142, 'Steven Pressfield': 143, 'Miguel Tejada-Flores': 144, 'Andrew Fleming': 145, 'Mike Figgis': 146, 'Susanne Bier': 147, 'John Frankenheimer': 148, 'Hal Ashby': 149, 'Siddharth Anand': 150, 'John Crowley': 151, 'Steve Rash': 152, 'Daniel Petrie': 153, 'Richard Eyre': 154, 'Richard Linklater': 155, 'Edward Tang': 156, 'Joe Camp': 157, 'Mark L. Lester': 158, 'Karan Johar': 159, 'Niranjan Iyengar': 160, 'Clifford Green': 161, 'Don Siegel': 162, 'Rohit Shetty': 163, 'Anees Bazmee': 164, 'Jean-Claude Carrière': 165, 'Farhad Samji': 166, 'Tim Metcalfe': 167, 'John R. 
Cherry III': 168, 'Robert Altman': 169, 'Michael Cimino': 170, 'Vidhu Vinod Chopra': 171, 'John Schlesinger': 172, 'Peter Bogdanovich': 173, 'Prakash Kapadia': 174, 'David Rayfiel': 175, 'Paul Brickman': 176, 'Jay Presson Allen': 177, 'Rick Famuyiwa': 178, 'Yunus Sajawal': 179, 'David Cronenberg': 180, 'Billy Bob Thornton': 181, 'Brent Goldberg': 182, 'Sajid': 183, 'Paul Dehn': 184, 'Barbet Schroeder': 185, 'Martin Ritt': 186, 'Sean McNamara': 187, "Pat O'Connor": 188, 'Jonathan Kaplan': 189, 'Howard Franklin': 190, 'James Carabatsos': 191, 'Bill Duke': 192, 'Jeremy Brock': 193, 'Blake Edwards': 194, 'Tom Holland': 195, 'Larry Gross': 196, 'Dan Gordon': 197, 'Robert Klane': 198, 'Neil Simon': 199, 'Gurinder Chadha': 200, 'Paul Michael Glaser': 201, 'Dean Riesner': 202, 'James Bridges': 203, 'Dennis Shryack': 204, 'Miguel Arteta': 205, 'Sean S. Cunningham': 206, 'Larry Cohen': 207, 'François Ozon': 208, 'Richard Loncraine': 209, 'Bill Lancaster': 210, 'Kevin Smith': 211, 'W.D. Richter': 212, 'Ernest Lehman': 213, 'Mary Agnes Donoghue': 214, 'Richard Attenborough': 215, 'Neil LaBute': 216, 'Norman Jewison': 217, 'David Newman': 218, 'Jon Erwin': 219, 'Stephen J. Rivele': 220, 'William Dear': 221, 'Brent Maddock': 222, 'Hugh Hudson': 223, 'Darryl Ponicsan': 224, 'George A. Romero': 225, 'Mark Pellington': 226, 'Aditya Chopra': 227, 'Charlie Kaufman': 228, 'Paul Mayeda Berges': 229, 'Nicholas Hytner': 230, 'Chuck Konzelman': 231, 'Cary Solomon': 232, 'Chris Matheson': 233, 'Brad Anderson': 234, 'Franklin J. 
Schaffner': 235, 'Bruce Beresford': 236, 'Jean-Claude Van Damme': 237, 'Richard Benjamin': 238, 'Gene Quintano': 239, 'John Stockwell': 240, 'Cheech Marin': 241, 'Jim Sheridan': 242, 'Marshall Brickman': 243, 'Tobe Hooper': 244, 'Ron Shelton': 245, 'Wim Wenders': 246, 'Fred Dekker': 247, 'Kar-Wai Wong': 248, 'Arthur Hiller': 249, 'Rob Zombie': 250, 'Nicholas Kazan': 251, 'Matthew Robbins': 252, 'Michael Pressman': 253, 'Bob Clark': 254, 'Tommy Chong': 255, 'Carl Reiner': 256, 'Robert Benton': 257, 'Carl Franklin': 258, 'Bruce A. Evans': 259, 'Woody Allen': 260, 'Dalton Trumbo': 261, 'Tom Mankiewicz': 262, 'Pedro Almodóvar': 263, 'Alan Parker': 264, 'Nancy Dowd': 265, 'Neil Jordan': 266, 'DJ Pooh': 267, 'Stephen Frears': 268, 'Albert Magnoli': 269, 'Reginald Hudlin': 270, 'Chris Rock': 271, 'Leon Capetanos': 272, 'Michael Cristofer': 273, 'Kevin Macdonald': 274, 'Ted Kotcheff': 275, 'Peter Dexter': 276, 'Stirling Silliphant': 277, 'John Irving': 278, 'Jeremiah S. Chechik': 279, 'Steve Zacharias': 280, 'Terry George': 281, 'Alan B. McElroy': 282, 'Andy Breckman': 283, 'Neal Israel': 284, 'Spike Lee': 285, 'Richard Lester': 286, 'Jonathan Bernstein': 287, 'Raynold Gideon': 288, 'Stan Dragoti': 289, 'David Hare': 290, 'Christopher Hampton': 291, 'Wayne Wang': 292, 'Peter Straughan': 293, 'Guy Hamilton': 294, 'Dwight H. Little': 295, 'Norman Steinberg': 296, 'Michael Ritchie': 297, 'Irwin Winkler': 298, 'Lewis Teague': 299, 'Stephen Kendrick': 300, 'Simon Wincer': 301, 'Don Jakoby': 302, 'James Gray': 303, 'Victor Salva': 304, 'Steve Miner': 305, 'Walter Hill': 306, 'Michael Hoffman': 307, 'Eric Bernt': 308, 'Paul McGuigan': 309, 'Michael Lehmann': 310, 'John Waters': 311, 'John Carpenter': 312, 'Robert Getchell': 313, 'Anders Thomas Jensen': 314, 'Roman Polanski': 315, 'Howard Deutch': 316, 'Milos Forman': 317, 'Herbert Ross': 318, 'Andrew Klavan': 319, 'Ice Cube': 320, 'John Schultz': 321, 'I.A.L. 
Diamond': 322, 'Abi Morgan': 323, 'Russell Mulcahy': 324, 'Dave Eggers': 325, 'Brett Leonard': 326, 'Ian McEwan': 327, 'Bo Goldman': 328, 'Grégory Levasseur': 329, 'Francis Veber': 330, 'David Seltzer': 331, 'Harry Elfont': 332, 'John Guillermin': 333, 'Gregory Widen': 334, 'Mick Garris': 335, 'Alastair Fothergill': 336, 'Robert Bolt': 337, 'David Loughery': 338, 'Don Mancini': 339, 'David Mamet': 340, 'Vijay Krishna Acharya': 341, 'Donald E. Westlake': 342, 'Elizabeth Chandler': 343, 'Jerry Juhl': 344, 'Douglas Day Stewart': 345, 'John Hodge': 346, 'Matthew Stone': 347, 'Ted Demme': 348, 'Jonathan Lynn': 349, 'Buck Henry': 350, 'Howard Zieff': 351, 'Alex Kendrick': 352, 'Gary Goldman': 353, 'Craig Gillespie': 354, 'John le Carré': 355, 'Harold Becker': 356, 'Jay Chandrasekhar': 357, 'Lewis Colick': 358, 'Dick Clement': 359, 'Deborah Kaplan': 360, 'Robert Ramsey': 361, 'Karen Janszen': 362, 'Peter Hedges': 363, 'Thomas Rickman': 364, 'Lem Dobbs': 365, 'Billy Crystal': 366, 'Luis Mandoki': 367, 'Steve Pink': 368, 'Paul Thomas Anderson': 369, 'Ian La Frenais': 370, 'Mark Rydell': 371, 'Iain Softley': 372, 'Gus Van Sant': 373, 'Allison Burnett': 374, 'Charles Edward Pogue': 375, 'Tina Gordon': 376, 'Joe Dante': 377, 'Terrence Malick': 378, 'John Milius': 379, 'Sofia Coppola': 380, 'David Lean': 381, 'John Kamps': 382, 'Dennis Feldman': 383, 'Jane Austen': 384, 'Ken Kwapis': 385, 'James Ellroy': 386, 'Ron Underwood': 387, 'Peter Hyams': 388, 'Victor Miller': 389, 'Andrew Bergman': 390, 'Akira Kurosawa': 391, 'Sheldon Lettich': 392, 'Elmore Leonard': 393, 'Gary Fleder': 394, 'Charles Shyer': 395, 'Michael Moore': 396, 'Terry Gilliam': 397, 'Heywood Gould': 398, 'Michael Caton-Jones': 399, 'Adam Rifkin': 400, 'Steven Knight': 401, 'Abhijat Joshi': 402, 'Mitch Glazer': 403, 'Joseph Barbera': 404, 'Don Bluth': 405, 'Rita M. 
Fink': 406, 'Harry Julian Fink': 407, 'William Friedkin': 408, 'Mark Neveldine': 409, 'Craig Bolotin': 410, 'Hugh Wilson': 411, 'Robert Towne': 412, 'Jean-Jacques Annaud': 413, "Gavin O'Connor": 414, 'Kathryn Bigelow': 415, 'Mark Brown': 416, 'Andrew Birkin': 417, 'Joe Roth': 418, 'J. Mills Goodloe': 419, 'Brian Robbins': 420, 'Scott Hicks': 421, 'Jonathan Demme': 422, 'Jeannot Szwarc': 423, 'Stephen Hopkins': 424, 'Roger Kumble': 425, 'Walon Green': 426, 'Eli Roth': 427, 'Carol Sobieski': 428, 'Stuart Gordon': 429, 'Hal Needham': 430, 'Ron Nyswaner': 431, 'Peter Chelsom': 432, 'Steve Martin': 433, 'Thomas Meehan': 434, 'Tom Tykwer': 435, 'Donald Petrie': 436, 'Stanley Kubrick': 437, 'Tom Stoppard': 438, 'George Tillman Jr.': 439, 'Ronald Harwood': 440, 'John G. Avildsen': 441, 'Ronny Yu': 442, 'Tyler Perry': 443, 'Danny DeVito': 444, 'Robert Redford': 445, 'Taylor Hackford': 446, 'David S. Ward': 447, 'Dale Launer': 448, 'Mary Shelley': 449, 'Brian Taylor': 450, 'Stephen King': 451, 'Joseph Ruben': 452, 'George Roy Hill': 453, 'Charles Martin Smith': 454, 'John Pogue': 455, 'Harris Goldberg': 456, 'Larry Ferguson': 457, 'Steven Brill': 458, 'John Patrick Shanley': 459, 'Robert King': 460, 'Chuck Pfarrer': 461, 'Sam Weisman': 462, 'Babaloo Mandel': 463, 'George Gallo': 464, 'Caroline Thompson': 465, 'Randal Kleiser': 466, 'Tom Schulman': 467, 'Richard Price': 468, 'William Goldman': 469, 'Jonathan Levine': 470, 'Josh Stolberg': 471, 'Peter Hewitt': 472, 'Rod Daniel': 473, 'Thomas Carter': 474, 'Jay Cocks': 475, 'Don Roos': 476, 'Phil Alden Robinson': 477, 'John Glen': 478, 'Francis Ford Coppola': 479, 'Wes Craven': 480, 'Alexandre Aja': 481, 'John Badham': 482, 'Larry Karaszewski': 483, 'Mel Brooks': 484, 'Debra Hill': 485, 'Alex Garland': 486, 'Scott Alexander': 487, 'Lowell Ganz': 488, 'Albert S. 
Ruddy': 489, 'James Schamus': 490, 'Wes Anderson': 491, 'Michael Grais': 492, 'Robert Wise': 493, 'John Landis': 494, 'Hark Tsui': 495, 'John Whitesell': 496, 'John Romano': 497, 'Rob Reiner': 498, 'Charles Dickens': 499, 'Andrzej Bartkowiak': 500, 'Malcolm D. Lee': 501, 'Les Mayfield': 502, 'David Webb Peoples': 503, 'Michel Gondry': 504, 'James Orr': 505, 'Michael Apted': 506, 'Sarah Kernochan': 507, 'Alan J. Pakula': 508, 'Nick Hornby': 509, 'Barry Levinson': 510, 'Curtis Hanson': 511, 'Stephen Herek': 512, 'Roger Donaldson': 513, 'Jon Avnet': 514, 'Wesley Strick': 515, 'Gregory Hoblit': 516, 'Nick Castle': 517, 'Roger Michell': 518, 'Joe Eszterhas': 519, 'George Clooney': 520, 'David Gordon Green': 521, 'Warren Beatty': 522, 'Pat Proft': 523, 'James McTeigue': 524, 'Greg Mottola': 525, 'Allen Hughes': 526, 'Frank Oz': 527, 'Mike Rich': 528, 'Lewis Gilbert': 529, 'Paul Rudnick': 530, 'S.S. Wilson': 531, 'John Fusco': 532, 'Larry McMurtry': 533, 'Leonard Nimoy': 534, 'Richard LaGravenese': 535, 'Jim Henson': 536, 'Aaron Seltzer': 537, 'Jason Friedberg': 538, 'Glen Morgan': 539, 'Mike White': 540, 'Penny Marshall': 541, 'Jon Amiel': 542, 'Mike Nichols': 543, 'Bill Walsh': 544, 'Diablo Cody': 545, 'Hal Barwood': 546, 'Mark Rosenthal': 547, 'Brian De Palma': 548, 'Richard Maibaum': 549, 'Gerald Di Pego': 550, 'Amy Holden Jones': 551, 'Albert Uderzo': 552, 'René Goscinny': 553, 'Penelope Spheeris': 554, 'George P. 
Cosmatos': 555, 'Lasse Hallström': 556, 'Jean-Pierre Jeunet': 557, 'David Levien': 558, 'Brian Koppelman': 559, 'Michael Miner': 560, 'David Zucker': 561, 'Ann Biderman': 562, 'Oliver Stone': 563, 'Keenen Ivory Wayans': 564, 'Robert Harling': 565, 'Jason Reitman': 566, 'Larry Gelbart': 567, 'Yimou Zhang': 568, 'Amy Heckerling': 569, 'Frank Pierson': 570, 'Daniel Waters': 571, 'Pierre Morel': 572, 'Jim Kouf': 573, 'Simon Wells': 574, 'Kirk Jones': 575, 'Philip Kaufman': 576, 'Nicholas Meyer': 577, 'Roger Spottiswoode': 578, 'Spike Jonze': 579, 'James DeMonaco': 580, 'Ethan Coen': 581, 'Joel Coen': 582, 'Edgar Rice Burroughs': 583, 'Michael Schiffer': 584, 'Leslie Bohem': 585, 'Herschel Weingrod': 586, 'Kunihiko Yuyama': 587, 'John Singleton': 588, 'Michael G. Wilson': 589, 'Albert Hughes': 590, 'Danny Boyle': 591, 'Harold Ramis': 592, 'Fred Wolf': 593, 'Michael Tolkin': 594, 'Satoshi Tajiri': 595, 'Gloria Katz': 596, 'Jessie Nelson': 597, 'David R. Ellis': 598, 'Mark Andrus': 599, 'David Twohy': 600, 'Renny Harlin': 601, 'James Foley': 602, 'Nora Ephron': 603, 'Jim Taylor': 604, 'Marcus Dunstan': 605, 'Patrick Melton': 606, 'Alexandre Dumas': 607, 'Allan Loeb': 608, 'Noah Baumbach': 609, 'Cameron Crowe': 610, 'Julian Fellowes': 611, 'Jeremy Leven': 612, 'Robin Swicord': 613, 'Mark Steven Johnson': 614, 'Delia Ephron': 615, 'Nicholas Sparks': 616, 'John Ridley': 617, 'Joel Schumacher': 618, 'Ted Griffin': 619, 'John Madden': 620, 'David Bowers': 621, 'Jim Abrahams': 622, 'Kaige Chen': 623, 'Catherine Hardwicke': 624, 'Jeb Stuart': 625, 'Peter Tolan': 626, 'Glenn Ficarra': 627, 'Andrew Davis': 628, 'Anthony Minghella': 629, 'Daniel Pyne': 630, 'George Cukor': 631, 'Terry Hayes': 632, 'William D. Wittliff': 633, 'Peter Weir': 634, 'Robert Rodriguez': 635, 'Martin Scorsese': 636, 'Tom S. 
Parker': 637, 'Ted Tally': 638, 'Marlon Wayans': 639, 'Willard Huyck': 640, 'Gary Winick': 641, 'John Hughes': 642, 'Wilfred Jackson': 643, 'Steve Carr': 644, 'Roger Avary': 645, 'Mark Waters': 646, 'William Hanna': 647, 'Nick Cassavetes': 648, 'Clint Eastwood': 649, 'Steven Soderbergh': 650, 'Jerry Belson': 651, 'Kenneth Lonergan': 652, 'Hossein Amini': 653, 'Ronald Bass': 654, 'George Miller': 655, 'Clyde Geronimi': 656, 'Arthur Conan Doyle': 657, 'Roald Dahl': 658, 'Lee Tamahori': 659, 'Sydney Pollack': 660, 'Richard Matheson': 661, 'Michael H. Weber': 662, 'Scott Neustadter': 663, 'Frank Darabont': 664, 'Edward Neumeier': 665, 'Christopher Landon': 666, 'Steven Rogers': 667, 'Andy Fickman': 668, 'Joel Cohen': 669, 'Michael Mann': 670, 'William Osborne': 671, 'Stephen Gaghan': 672, 'Paul Weitz': 673, 'Stieg Larsson': 674, 'Justin Haythe': 675, 'Johnny Knoxville': 676, 'Brian Helgeland': 677, 'Dominic Sena': 678, 'Garry Marshall': 679, 'Barry W. Blaustein': 680, 'Len Blum': 681, 'Dan Aykroyd': 682, 'Phillip Noyce': 683, 'Ed Decter': 684, 'Lawrence Konner': 685, 'Joe Wright': 686, 'Darren Aronofsky': 687, 'Kevin Williamson': 688, 'John Thomas': 689, 'Evan Goldberg': 690, 'David Dobkin': 691, 'Gregory Poirier': 692, 'Bobby Farrelly': 693, 'Audrey Wells': 694, 'Juliet Snowden': 695, 'Joe Carnahan': 696, 'Judd Apatow': 697, 'Frank Miller': 698, 'Harald Zwart': 699, 'Simon Pegg': 700, 'Dennis Lehane': 701, 'Nat Mauldin': 702, 'Menno Meyjes': 703, 'Ben Stiller': 704, 'Mario Puzo': 705, 'Ishirô Honda': 706, 'Emile Ardolino': 707, 'David O. Russell': 708, 'Timothy Harris': 709, 'Leslie Dixon': 710, 'Wolfgang Reitherman': 711, 'Gabriele Muccino': 712, 'Luc Besson': 713, 'Jason Segel': 714, 'Maya Forbes': 715, 'John Grisham': 716, 'Tim Story': 717, 'John Requa': 718, 'Aline Brosh McKenna': 719, 'Marc Lawrence': 720, 'Tarsem Singh': 721, 'Ian Fleming': 722, 'Jeffrey Reddick': 723, 'Brian Levant': 724, 'John J. 
Strauss': 725, 'Kurt Wimmer': 726, 'Richard Marquand': 727, 'D.J. Caruso': 728, 'Karen McCullah': 729, 'Kirsten Smith': 730, 'Seth Rogen': 731, 'Daniel Petrie Jr.': 732, 'Edward Zwick': 733, 'Kevin Reynolds': 734, 'Paul Attanasio': 735, 'Carl Gottlieb': 736, 'Robert Nelson Jacobs': 737, 'John McTiernan': 738, 'Denis Villeneuve': 739, 'Steven E. de Souza': 740, 'Guillermo del Toro': 741, 'Jez Butterworth': 742, 'P.J. Hogan': 743, 'Leigh Whannell': 744, 'Gavin Hood': 745, 'Alec Sokolow': 746, 'Antoine Fuqua': 747, 'Ken Kaufman': 748, 'David Frankel': 749, 'Mick Jackson': 750, 'John Cleese': 751, 'Jaume Collet-Serra': 752, 'Will Ferrell': 753, 'Jim Thomas': 754, 'Andrew Niccol': 755, 'Martin Brest': 756, 'Aaron Sorkin': 757, 'Adrian Lyne': 758, 'Larry Charles': 759, 'Pierre Boulle': 760, 'David Berenbaum': 761, 'Dana Fox': 762, 'William Peter Blatty': 763, 'Peter Segal': 764, 'Ang Lee': 765, 'Richard Donner': 766, 'Nancy Meyers': 767, 'Boaz Yakin': 768, 'Rob Cohen': 769, 'John Gatins': 770, 'J.F. Lawton': 771, 'Shawn Wayans': 772, 'Andrew Davies': 773, 'Peter Farrelly': 774, 'Karey Kirkpatrick': 775, 'Bruce Joel Rubin': 776, 'Matt Manfredi': 777, 'Chuck Russell': 778, 'Neil Gaiman': 779, 'Paul Hogan': 780, 'John Morris': 781, 'Michael Petroni': 782, 'Jeffrey Boam': 783, 'John Moore': 784, 'Charles Leavitt': 785, 'Dennis Dugan': 786, 'Donald E. Stewart': 787, 'Susannah Grant': 788, 'Phil Hay': 789, 'Michael Ferris': 790, 'Ivan Reitman': 791, 'Duane Adler': 792, 'Robert Luketic': 793, 'Alexander Payne': 794, 'Andrew Kevin Walker': 795, 'Eric Heisserer': 796, 'Adam Shankman': 797, 'Philip K. 
Dick': 798, "Dan O'Bannon": 799, 'Jerry Zucker': 800, 'Tony Scott': 801, 'Andy Tennant': 802, 'Peter Laird': 803, 'Sean Anders': 804, 'Ben Affleck': 805, 'Oren Peli': 806, 'Mark Frost': 807, 'William Nicholson': 808, 'Emma Thompson': 809, 'Anne Fletcher': 810, 'Eric Roth': 811, 'Robert Mark Kamen': 812, 'John Brancato': 813, 'Adam Cooper': 814, 'Bill Collage': 815, 'Gene Roddenberry': 816, 'Greg Berlanti': 817, 'John Lee Hancock': 818, 'Billy Ray': 819, 'Rich Wilkes': 820, 'Frank Coraci': 821, 'Ed Solomon': 822, 'Graham Yost': 823, 'Nicholas Stoller': 824, 'Peter Benchley': 825, 'Richard Wenk': 826, 'Betty Thomas': 827, 'Eddie Murphy': 828, 'Peter Berg': 829, 'Seth Gordon': 830, 'Alvin Sargent': 831, 'Ronald Shusett': 832, 'Tim Hill': 833, 'Marshall Herskovitz': 834, 'Paul W.S. Anderson': 835, 'Will Gluck': 836, 'Alejandro G. Iñárritu': 837, 'Paul Verhoeven': 838, 'Robert Rodat': 839, 'James L. Brooks': 840, 'Marc Silverstein': 841, 'Abby Kohn': 842, 'Damien Chazelle': 843, 'William Monahan': 844, 'Craig Pearce': 845, 'Sylvester Stallone': 846, 'David Ayer': 847, 'Hayao Miyazaki': 848, 'Robert Gordon': 849, 'J.M. Barrie': 850, 'Danilo Bach': 851, 'Tom McCarthy': 852, 'John Hamburg': 853, 'Doug Liman': 854, 'Kenneth Branagh': 855, 'Alex Proyas': 856, 'Quentin Tarantino': 857, 'Steven Zaillian': 858, 'James V. Hart': 859, 'Tim Herlihy': 860, 'Dan Gilroy': 861, 'Gary Ross': 862, 'Jack Epps Jr.': 863, 'Jim Cash': 864, 'Brad Silberling': 865, 'Jonathan Mostow': 866, 'Thomas Harris': 867, 'James Vanderbilt': 868, 'Scott Frank': 869, 'Mike Myers': 870, 'Chad Hayes': 871, 'Don Rhymer': 872, 'Steve Koren': 873, 'Scott Moore': 874, 'Tommy Swerdlow': 875, 'Timothy Dowling': 876, 'Mike Newell': 877, 'Scot Armstrong': 878, 'Steve Conrad': 879, 'Peter Baynham': 880, 'Edgar Wright': 881, 'Steve Oedekerk': 882, 'Ron Howard': 883, 'David Benioff': 884, 'Ridley Scott': 885, 'Jon M. 
Chu': 886, 'Gerry Swallow': 887, 'Martin Campbell': 888, 'Adam Sandler': 889, 'Skip Woods': 890, 'Adam McKay': 891, 'Joe Johnston': 892, 'Wolfgang Petersen': 893, 'Lorne Cameron': 894, 'Stephen Sommers': 895, 'James Mangold': 896, 'Paul Haggis': 897, 'Jay Roach': 898, 'George Nolfi': 899, 'Bob Gale': 900, 'Robert Schwentke': 901, 'Matthew Michael Carnahan': 902, 'Jonathan Hensleigh': 903, 'Marc Forster': 904, 'Baz Luhrmann': 905, 'John Woo': 906, 'Jon Lucas': 907, 'Thomas Lennon': 908, 'Paul Feig': 909, 'Simon West': 910, 'Tom Clancy': 911, 'Randall Wallace': 912, 'J. David Stem': 913, 'Kevin Jarre': 914, 'Scott Derrickson': 915, 'Louis Leterrier': 916, 'Len Wiseman': 917, 'McG': 918, 'Mike Werb': 919, 'Robert Ben Garant': 920, 'Raja Gosnell': 921, 'Craig Mazin': 922, 'Peter Morgan': 923, 'Tom Shadyac': 924, 'William Davies': 925, 'Simon Beaufoy': 926, 'David Lindsay-Abaire': 927, 'Chris Columbus': 928, 'Jay Scherick': 929, 'Richard Curtis': 930, 'Dan Fogelman': 931, 'David N. Weiss': 932, 'Jon Turteltaub': 933, 'David Fincher': 934, 'Brett Ratner': 935, 'Matt Reeves': 936, 'Barry Sonnenfeld': 937, 'Jonathan Liebesman': 938, 'Tony Gilroy': 939, 'Robert Zemeckis': 940, 'Shawn Levy': 941, 'Shane Black': 942, 'F. Gary Gray': 943, 'Sam Raimi': 944, 'Tim Burton': 945, 'Joe Shuster': 946, 'Paul Greengrass': 947, 'Marianne Wibberley': 948, 'Mel Gibson': 949, 'Laeta Kalogridis': 950, 'Erich Hoeber': 951, 'Jon Hoeber': 952, 'Robert Ludlum': 953, 'David Koepp': 954, 'Tab Murphy': 955, 'Akiva Goldsman': 956, 'William Broyles Jr.': 957, 'Ruben Fleischer': 958, 'Lilly Wachowski': 959, 'Lana Wachowski': 960, 'Jonathan Goldstein': 961, 'Alfred Gough': 962, 'Dean Devlin': 963, 'Cormac Wibberley': 964, 'M. 
Night Shyamalan': 965, 'Mark Bomback': 966, 'Lawrence Kasdan': 967, 'Jeffrey Price': 968, 'Jerry Siegel': 969, 'Mike Mitchell': 970, 'Jan de Bont': 971, 'John Francis Daley': 972, 'Brad Peyton': 973, 'Bill Condon': 974, 'John Logan': 975, 'Matthew Vaughn': 976, 'Alfonso Cuarón': 977, 'Peyton Reed': 978, 'Mark Burton': 979, 'Christopher Miller': 980, 'Drew Goddard': 981, 'Darren Lemke': 982, 'Ron Clements': 983, 'George Clayton Johnson': 984, 'Charles Perrault': 985, 'Jane Goldman': 986, 'John August': 987, 'Kelly Asbury': 988, 'Michael McCullers': 989, 'Phil Lord': 990, 'Scott Silver': 991, 'David S. Goyer': 992, 'Chris Weitz': 993, 'Peter S. Seaman': 994, 'Jake Kasdan': 995, 'Guy Ritchie': 996, 'Michael Green': 997, 'Marc Webb': 998, 'Etan Cohen': 999, 'Steven Spielberg': 1000, ...}
len(crewRank)
1091
Now we will rank the movies based on crew team rank
import random
def getCrewsTeamRank(crews):
    """Return the summed rank of all crew members of one movie.

    Each member found in the module-level ``crewRank`` mapping contributes
    the rank stored there. Any member missing from the mapping contributes
    a random integer between 1 and 100 (inclusive) instead, so the result is
    not reproducible across runs unless ``random`` has been seeded.

    ``crews`` is iterated once; an empty iterable yields 0.
    """
    score = 0
    for member in crews:
        if member in crewRank:
            score += crewRank[member]
        else:
            # Unranked crew member: fall back to a random low rank.
            score += random.randint(1, 100)
    return score
data2 = data.copy()
data2['crewsTeamRank'] = data2['Crew'].apply(getCrewsTeamRank)
data2['crewsTeamRank'].value_counts()
260 41 74 30 132 30 73 29 102 29 .. 2315 1 1239 1 637 1 1400 1 2606 1 Name: crewsTeamRank, Length: 2110, dtype: int64
corr = pearsonr(data2['crewsTeamRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between crewsTeamRank and Gross: %.3f' % corr)
data2.plot.scatter(x='crewsTeamRank', y='Gross_worldwide', color='brown')
Pearsons correlation between crewsTeamRank and Gross: 0.550
<AxesSubplot:xlabel='crewsTeamRank', ylabel='Gross_worldwide'>
final['crewsTeamRank'] = data2['crewsTeamRank']
Result correlation:
Now consider the number of crew members in the top 150 for each movie
releases4crew.sort_values(by='Mean', ascending=False, inplace=True)
top150Crew = list(releases4crew['Crew'][0:150])
def getNumTopCrew(crews):
    """Count how many members of ``crews`` appear in ``top150Crew``.

    ``top150Crew`` is the module-level list built from the first 150 rows of
    ``releases4crew`` after sorting by ``Mean`` in descending order. A member
    listed more than once in ``crews`` is counted each time it occurs.
    """
    return sum(1 for member in crews if member in top150Crew)
data2 = data.copy()
data2['NumTopCrew'] = data2['Crew'].apply(getNumTopCrew)
data2['NumTopCrew'].value_counts()
0 7911 1 555 2 187 3 76 4 21 5 2 Name: NumTopCrew, dtype: int64
corr = pearsonr(data2['NumTopCrew'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumTopCrew and Gross: %.3f' % corr)
data2.plot.scatter(x='NumTopCrew', y='Gross_worldwide', color='brown')
Pearsons correlation between NumTopCrew and Gross: 0.621
<AxesSubplot:xlabel='NumTopCrew', ylabel='Gross_worldwide'>
final['NumTopCrew'] = data2['NumTopCrew']
Consider whether a film has any crew member in the top 50
releases4crew.sort_values(by='Mean', ascending=False, inplace=True)
top50Crew = list(releases4crew['Crew'][0:50])
def getHasTopCrew(crews):
    """Return 1 if any member of ``crews`` is in ``top50Crew``, otherwise 0.

    ``top50Crew`` is the module-level list built from the first 50 rows of
    ``releases4crew`` after sorting by ``Mean`` in descending order. The scan
    stops at the first match.
    """
    return int(any(member in top50Crew for member in crews))
data2 = data.copy()
data2['HasTopCrew'] = data2['Crew'].apply(getHasTopCrew)
data2['HasTopCrew'].value_counts()
0 8480 1 272 Name: HasTopCrew, dtype: int64
corr = pearsonr(data2['HasTopCrew'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between HasTopCrew and Gross: %.3f' % corr)
data2.plot.scatter(x='HasTopCrew', y='Gross_worldwide', color='brown')
Pearsons correlation between HasTopCrew and Gross: 0.544
<AxesSubplot:xlabel='HasTopCrew', ylabel='Gross_worldwide'>
final['HasTopCrew'] = data2['HasTopCrew']
Hypothesis: Does gross depend on the number of studios that participate in a movie?
# Work on a copy so the NumStudios feature does not leak into `data`.
data2 = data.copy()
# Number of entries in each movie's Studios value
# (assumes every entry is a sized collection, presumably a list of names).
data2['NumStudios'] = data2['Studios'].map(len)
data2['NumStudios'].value_counts()
3 5307 2 2088 1 1291 0 41 4 22 5 3 Name: NumStudios, dtype: int64
# Bar chart of how many movies have each studio count, ordered by studio count.
# The column names are set explicitly ('index' for the studio count,
# 'NumStudios' for the frequency) instead of relying on what
# value_counts().reset_index() happens to call them: pandas 2.0 changed those
# default names, which made sort_values(by='index') raise a KeyError.
(
    data2['NumStudios']
    .value_counts()
    .sort_index()
    .rename_axis('index')
    .reset_index(name='NumStudios')
    .plot.bar(x='index', y='NumStudios')
)
# Linear correlation between the studio count and the worldwide gross.
corr = pearsonr(data2['NumStudios'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumStudios and Gross: %.3f' % corr)
data2.plot.scatter(x='NumStudios', y='Gross_worldwide', color='purple')
Pearsons correlation between NumStudios and Gross: 0.133
<AxesSubplot:xlabel='NumStudios', ylabel='Gross_worldwide'>
final['NumStudios'] = data2['NumStudios']
Correlation 0.133: Not very relevant
Movies with 4 or 5 studios may be outliers. We will try dropping them.
# Remove the rare movies that list 4 or 5 studios and re-check the correlation.
# The rows are filtered directly on data2 in a single pass; the previous code
# selected from `data` with a mask built from `data2`, which only worked
# because both frames share the same index.
data3 = data2[~data2['NumStudios'].isin([4, 5])].copy()
# Bar chart of movies per studio count, ordered by studio count. Column names
# are set explicitly because pandas 2.0 changed the names produced by
# value_counts().reset_index(), which broke sort_values(by='index').
(
    data3['NumStudios']
    .value_counts()
    .sort_index()
    .rename_axis('index')
    .reset_index(name='NumStudios')
    .plot.bar(x='index', y='NumStudios')
)
corr = pearsonr(data3['NumStudios'], data3['Gross_worldwide'])[0]
print('Pearsons correlation between NumStudios and Gross: %.3f' % corr)
data3.plot.scatter(x='NumStudios', y='Gross_worldwide', color='purple')
Pearsons correlation between NumStudios and Gross: 0.135
<AxesSubplot:xlabel='NumStudios', ylabel='Gross_worldwide'>
The correlation stays about the same, so we don't need to delete them
Studio and Gross of the movies they work for
studio = parseWithMoneyAndCount(data, 'Studios')
# Top 15 studios by number of movies.
fig = plt.figure(figsize=(8, 4))
data2 = studio.sort_values(by='Count', ascending=False).head(15)
plt.bar(x='Studios', height='Count', data=data2, color="salmon")
plt.xticks(fontsize=10, rotation=90)
plt.title("Studio and Number Of Movies they work for", fontsize=15)
plt.ylabel("Number Of Movies", fontsize=10)
plt.show()

# Top 15 studios by total gross over all their movies.
fig = plt.figure(figsize=(8, 4))
data2 = studio.sort_values(by='Total', ascending=False).head(15)
plt.bar(x='Studios', height='Total', data=data2, color="salmon")
plt.xticks(fontsize=10, rotation=90)
plt.title("Studio and Total Gross of Movies they work for", fontsize=15)
plt.ylabel("Total Gross", fontsize=10)
plt.show()
# Top 15 studios by mean gross per movie.
fig = plt.figure(figsize=(8, 4))
data3 = studio.sort_values(by='Mean', ascending=False).head(15)
plt.bar(x='Studios', height='Mean', data=data3, color="salmon")
plt.xticks(fontsize=10, rotation=90)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Average Gross", fontsize=10)
plt.title("Studio and Average Gross per movie they work for", fontsize=15)
plt.show()
# Top 15 studios by median gross per movie.
fig = plt.figure(figsize=(8, 4))
data3 = studio.sort_values(by='Median', ascending=False)[0:15]
plt.bar(data=data3, x='Studios', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=10)
# This chart plots the 'Median' column, so it is labelled as a median; the
# labels previously said "Average Gross", copied from the mean chart.
plt.title("Studio and Median Gross per movie they work for", fontsize=15)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Median Gross", fontsize=10)
plt.show()
studio.sort_values(by='Mean', ascending=False)
Studios | Total | Count | Mean | Median | |
---|---|---|---|---|---|
6417 | Truenorth Productions | 2069521700 | 1 | 2.069522e+09 | 2.069522e+09 |
3434 | Jason Roberts Productions | 2048359754 | 1 | 2.048360e+09 | 2.048360e+09 |
5820 | South Pictures | 2048359754 | 1 | 2.048360e+09 | 2.048360e+09 |
1021 | British Film Commission | 1662899439 | 1 | 1.662899e+09 | 1.662899e+09 |
6616 | Vita-Ray Dutch Productions (III) | 1153337496 | 1 | 1.153337e+09 | 1.153337e+09 |
... | ... | ... | ... | ... | ... |
4827 | Paramount Famous Lasky Corporation | 746 | 1 | 7.460000e+02 | 7.460000e+02 |
6659 | Walter Wanger Productions | 623 | 1 | 6.230000e+02 | 6.230000e+02 |
994 | Break Media | 528 | 1 | 5.280000e+02 | 5.280000e+02 |
1174 | Campfire | 528 | 1 | 5.280000e+02 | 5.280000e+02 |
3439 | Jaz Films | 95 | 1 | 9.500000e+01 | 9.500000e+01 |
6883 rows × 5 columns
This average gross plot for studio is not reliable to decide which Studio is big or not since there are some Studios which participate only in one Movie
We can define big studios in 4 ways:
Now we will exclude the studios with 5 or fewer releases
# Keep only studios with more than 5 movies, ordered by mean gross (highest first).
studios10Larger = studio.loc[studio['Count'] > 5].sort_values(by='Mean', ascending=False)

# Bar chart of the 25 studios with the highest mean gross per movie.
fig = plt.figure(figsize=(8, 4))
plt.bar(x='Studios', height='Mean', data=studios10Larger.head(25), color="salmon")
plt.xticks(fontsize=10, rotation=90)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Average Gross", fontsize=10)
plt.title("Studio and Average Gross per movie they work for", fontsize=15)
plt.show()
studios10Larger
Studios | Total | Count | Mean | Median | |
---|---|---|---|---|---|
4117 | Marvel Studios | 23795322645 | 26 | 9.152047e+08 | 813667029.0 |
3830 | Lightstorm Entertainment | 6754939278 | 9 | 7.505488e+08 | 378882411.0 |
3226 | Illumination Entertainment | 6693236668 | 10 | 6.693237e+08 | 588661184.5 |
4979 | Pixar Animation Studios | 14528032320 | 22 | 6.603651e+08 | 601716911.5 |
6029 | Syncopy | 3775191007 | 6 | 6.291985e+08 | 597530912.5 |
... | ... | ... | ... | ... | ... |
6843 | Zenith Entertainment | 17321794 | 8 | 2.165224e+06 | 1708545.5 |
4453 | National Film Board of Canada (NFB) | 14559833 | 7 | 2.079976e+06 | 1601612.0 |
4693 | Ontario Film Development Corporation | 8636461 | 6 | 1.439410e+06 | 1364006.0 |
3677 | La Sept Cinéma | 10915352 | 8 | 1.364419e+06 | 1229040.0 |
1593 | Concorde Pictures | 8463581 | 7 | 1.209083e+06 | 1242995.0 |
595 rows × 5 columns
The top studios bar chart now looks more familiar.
Move on to rank the Studios.
# Order studios from lowest to highest mean gross, then number them from 1:
# the studio with the lowest mean gets rank 1, the highest gets the largest rank.
studios10Larger.sort_values(by='Mean', ascending=True, inplace=True)
studioRank = {
    name: position
    for position, name in enumerate(studios10Larger['Studios'], start=1)
}
studioRank
{'Concorde Pictures': 1, 'La Sept Cinéma': 2, 'Ontario Film Development Corporation': 3, 'National Film Board of Canada (NFB)': 4, 'Zenith Entertainment': 5, 'Duplass Brothers Productions': 6, 'A&E IndieFilms': 7, 'British Screen Productions': 8, 'RAI Radiotelevisione Italiana': 9, 'American Playhouse': 10, 'Films A2': 11, 'Sovereign Pictures': 12, 'Lorimar Motion Pictures': 13, 'Arte France Cinéma': 14, 'Diamond Docs': 15, 'MK2 Productions': 16, 'Arts Council of England': 17, 'Road Movies Filmproduktion': 18, 'The Rank Organisation': 19, 'October Films': 20, 'Alliance Communications Corporation': 21, 'Live Entertainment': 22, 'Les Films Ariane': 23, 'Trimark Pictures': 24, 'Triumph Films': 25, 'CBS Theatrical Films': 26, 'Big Indie Pictures': 27, 'Alliance Entertainment': 28, 'Fine Line Features': 29, 'Cineplex Odeon Films': 30, 'Les Films Alain Sarde': 31, 'Gener8Xion Entertainment': 32, 'Téléfilm Canada': 33, 'Rhombus Media': 34, 'Canadian Film Development Corporation (CFDC)': 35, 'CiBy 2000': 36, 'Vision PDG': 37, 'Lorimar Productions': 38, 'Merchant Ivory Productions': 39, 'Destination Films': 40, 'Balcor Film Investors': 41, 'Renaissance Films': 42, 'Celluloid Dreams': 43, 'New World Pictures': 44, 'Alliance Atlantis Communications': 45, 'Weintraub Entertainment Group': 46, 'Atlantic Entertainment Group': 47, 'Danmarks Radio (DR)': 48, 'Silver Screen Partners': 49, 'The Samuel Goldwyn Company': 50, 'Killer Films': 51, 'Avenue Pictures': 52, 'DD Productions': 53, 'Golan-Globus Productions': 54, 'De Laurentiis Entertainment Group (DEG)': 55, 'Bandai Visual Company': 56, 'CG Cinéma': 57, 'Det Danske Filminstitut': 58, 'Island Pictures': 59, 'Savoy Pictures': 60, 'Sony Pictures Classics': 61, 'Worldview Entertainment': 62, 'Film Victoria': 63, 'Price Entertainment': 64, 'Shree Ashtavinayak Cine Vision': 65, 'Cinémaginaire Inc.': 66, 'Hart Sharp Entertainment': 67, 'Cannon Films': 68, 'The Cannon Group': 69, 'Recorded Picture Company (RPC)': 70, 'The Australian 
Film Commission': 71, 'France 3 Cinéma': 72, 'A&M Films': 73, 'Delphi III Productions': 74, 'Kings Road Entertainment': 75, 'Muse Productions': 76, 'Serendipity Point Films': 77, 'The Mirisch Corporation': 78, 'Broad Green Pictures': 79, 'British Film Institute (BFI)': 80, 'Element Pictures': 81, 'Alive Films': 82, 'Distant Horizon': 83, 'Capitol Films': 84, 'Les Films du Losange': 85, 'Sweetland Films': 86, 'ML Delphi Premier Productions': 87, 'Amazon Studios': 88, 'FilmFour': 89, 'Good Machine': 90, 'Viacom18 Motion Pictures': 91, 'Likely Story': 92, 'Palace Pictures': 93, 'Gladden Entertainment': 94, 'Australian Film Finance Corporation (AFFC)': 95, 'Bavaria Film': 96, 'Rai Cinema': 97, 'Island World': 98, 'Excel Entertainment': 99, 'Isle of Man Film': 100, 'Artisan Entertainment': 101, 'Eros Worldwide': 102, 'MDP Worldwide': 103, 'John Wells Productions': 104, 'British Broadcasting Corporation (BBC)': 105, 'Producers Sales Organization (PSO)': 106, 'Element Films': 107, 'Detour Filmproduction': 108, 'Empire Pictures': 109, 'Epic Productions': 110, 'Embassy Pictures': 111, 'HanWay Films': 112, 'Pandora Filmproduktion': 113, 'Orly Films': 114, 'Nelvana': 115, 'TAFT Entertainment Pictures': 116, 'Screen Ireland': 117, 'The Bubble Factory': 118, 'IFC Productions': 119, 'Renn Productions': 120, 'Bórd Scannán na hÉireann': 121, 'Automatik Entertainment': 122, 'Redchillies.VFX': 123, "Centre national du cinéma et de l'image animée (CNC)": 124, 'Fandango': 125, 'ApolloMedia Distribution': 126, 'Goldcrest Films International': 127, 'Haut et Court': 128, 'ARTE': 129, 'Channel Four Films': 130, 'Cinerenta Medienbeteiligungs KG': 131, 'Brightlight Pictures': 132, 'Why Not Productions': 133, 'Propaganda Films': 134, 'Warner Independent Pictures (WIP)': 135, 'Nadiadwala Grandson Entertainment': 136, 'GreeneStreet Films': 137, 'France 2 Cinéma': 138, 'Fox STAR Studios': 139, 'Dharma Productions': 140, 'Red Chillies Entertainment': 141, 'Hemdale': 142, 'Kingsgate Films': 143, 
'Enigma Productions': 144, 'Spelling Films': 145, 'Zentropa Entertainments': 146, 'Jack Rollins & Charles H. Joffe Productions': 147, 'Marty Katz Productions': 148, 'Depth of Field': 149, 'Lorimar Film Entertainment': 150, 'Incorporated Television Company (ITC)': 151, 'Sean S. Cunningham Films': 152, 'Film i Väst': 153, 'HandMade Films': 154, 'ABC Motion Pictures': 155, 'Provident Films': 156, 'Overture Films': 157, '2929 Productions': 158, 'Groundswell Productions': 159, 'RKO Pictures': 160, 'LD Entertainment': 161, 'Brooksfilms': 162, 'Vestron Pictures': 163, 'A24': 164, 'Big Beach Films': 165, 'Broadway Pictures': 166, 'Fox Atomic': 167, 'Delphi V Productions': 168, 'This Is That Productions': 169, 'Largo Entertainment': 170, 'Industry Entertainment': 171, "Hell's Kitchen Films": 172, 'Medusa Film': 173, 'Hawn / Sylbert Movie Company': 174, 'View Askew Productions': 175, 'First Artists': 176, '40 Acres & A Mule Filmworks': 177, 'Rysher Entertainment': 178, 'Orion Pictures': 179, 'Jackson/McHenry Company': 180, 'The': 181, 'H2L Media Group': 182, 'Franchise Pictures': 183, 'Pressman Film': 184, 'JVC Entertainment Networks': 185, 'Canal+ España': 186, 'Yash Raj Films': 187, 'Turner Pictures (I)': 188, 'IAC Films': 189, 'El Deseo': 190, 'Sidney Kimmel Entertainment': 191, 'UTV Motion Pictures': 192, 'Delphi II Productions': 193, 'IM Global': 194, 'BBC Films': 195, 'Gaumont': 196, 'Delphi IV Productions': 197, 'Capella International': 198, 'Smart Egg Pictures': 199, 'Brillstein-Grey Entertainment': 200, 'Trilogy Entertainment Group': 201, 'RadicalMedia': 202, 'Art Linson Productions': 203, 'Westdeutscher Rundfunk (WDR)': 204, 'Nelson Entertainment': 205, 'New Deal Productions': 206, 'The Geffen Company': 207, 'Amercent Films': 208, 'Bob Yari Productions': 209, 'X-Filme Creative Pool': 210, 'Zoetrope Studios': 211, 'Film Council': 212, 'Gaylord Films': 213, 'Touchwood Pacific Partners 1': 214, 'Bona Fide Productions': 215, 'National Lampoon': 216, 'SLM Production 
Group': 217, 'Permut Presentations': 218, 'Intrepid Pictures': 219, 'Silver Screen Partners II': 220, 'Pantelion Films': 221, 'Pathé Pictures International': 222, 'Turman-Foster Company': 223, 'Screen Australia': 224, 'Film4': 225, 'Sandollar Productions': 226, 'The Malpaso Company': 227, 'Ghoulardi Film Company': 228, 'Vinod Chopra Productions': 229, 'Waypoint Entertainment': 230, 'Bold Films': 231, 'Samuel Goldwyn Films': 232, 'Rastar Pictures': 233, 'UK Film Council': 234, 'Gramercy Pictures (I)': 235, 'American Zoetrope': 236, 'FilmEngine': 237, 'Wild Bunch': 238, 'MWM Studios': 239, 'Morgan Creek Entertainment': 240, 'The Ladd Company': 241, 'Lions Gate Films': 242, 'Indian Paintbrush': 243, 'Affirm Films': 244, 'Dino De Laurentiis Company': 245, 'Robert Evans Company': 246, 'Cheyenne Enterprises': 247, 'Caravan Pictures': 248, 'Cecchi Gori Group Tiger Cinematografica': 249, 'Melvin Simon Productions': 250, 'Anonymous Content': 251, 'BRON Studios': 252, 'Imagenation Abu Dhabi FZ': 253, 'Disneynature': 254, 'IndieProd Company Productions': 255, 'Winkler Films': 256, 'Exclusive Media Group': 257, "Cinema '84": 258, 'Mr. 
Mudd': 259, 'Broadway Video': 260, 'DNA Films': 261, 'Rogue Pictures': 262, 'Hyde Park Entertainment': 263, 'EMI Films': 264, 'Ruby Films': 265, 'Fox Searchlight Pictures': 266, 'Canal+': 267, 'Clinica Estetico': 268, 'HBO Films': 269, 'Rastar Films': 270, 'Outlaw Productions (I)': 271, 'Mad Chance': 272, 'Gran Via Productions': 273, 'Section Eight': 274, 'FilmNation Entertainment': 275, 'Open Road Films (II)': 276, 'Hollywood Pictures': 277, 'Motion Picture Corporation of America (MPCA)': 278, 'Endgame Entertainment': 279, 'Voltage Pictures': 280, 'Epsilon Motion Pictures': 281, 'Wildwood Enterprises': 282, 'Les Productions Artistes Associés': 283, 'State Street Pictures': 284, 'Odyssey Entertainment': 285, 'Stage 6 Films': 286, 'Perdido Productions': 287, 'Mandalay Entertainment': 288, 'Miramax': 289, 'Geffen Pictures': 290, 'Interscope Communications': 291, 'Cinema Group Ventures': 292, 'DiNovi Pictures': 293, 'American Empirical Pictures': 294, 'Participant': 295, 'Pathé': 296, 'Alcor Films': 297, 'Davis-Films': 298, 'Dark Castle Entertainment': 299, 'Tyler Perry Studios': 300, 'Gravier Productions': 301, 'David Foster Productions': 302, 'Estudios Churubusco Azteca S.A.': 303, 'Black Bear Pictures': 304, 'MTV Films': 305, 'Castle Rock Entertainment': 306, 'Beacon Communications': 307, 'Sprockets Music': 308, 'Barwood Films': 309, 'Gran Via': 310, 'Tapestry Films': 311, 'Polygram Filmed Entertainment': 312, 'CBS Films': 313, 'Lakeshore Entertainment': 314, 'Toei Company': 315, 'The Tyler Perry Company': 316, 'Spring Creek Productions': 317, 'Bel Air Entertainment': 318, 'Paramount Vantage': 319, 'Aamir Khan Productions': 320, 'TriStar Pictures': 321, 'Hanna-Barbera Productions': 322, 'The Montecito Picture Company': 323, 'Misher Films': 324, 'Major Studio Partners': 325, 'Rainforest Films': 326, 'Laurence Mark Productions': 327, 'Baltimore Pictures': 328, 'Beacon Pictures': 329, 'Phoenix Pictures': 330, 'Bazelevs Production': 331, 'Saturn Films': 332, 'Screen 
Gems': 333, 'St. Petersburg Clearwater Film Commission': 334, 'Intermedia Films': 335, 'Ixtlan': 336, 'Walt Disney Productions': 337, 'Lawrence Bender Productions': 338, 'Robert Simonds Productions': 339, 'United Artists': 340, 'Focus Features': 341, 'Dimension Films': 342, 'River Road Entertainment': 343, 'Pariah': 344, 'Cube Vision': 345, 'PolyGram Filmed Entertainment': 346, 'Jersey Films': 347, 'Emmett/Furla/Oasis Films (EFO Films)': 348, 'Double Feature Films': 349, 'Warner Bros. Animation': 350, 'Disney Television Animation': 351, 'Entertainment One': 352, 'Carolco Pictures': 353, 'Star Partners II Ltd.': 354, 'Lawrence Gordon Productions': 355, 'Zucker Brothers Productions': 356, '3 Arts Entertainment': 357, 'Tollin/Robbins Productions': 358, 'Playtone': 359, '4 Kids Entertainment': 360, 'Delphi Films': 361, 'Face Productions': 362, 'Imagine Films Entertainment': 363, 'Newmarket Capital Group': 364, 'Shangri-La Entertainment': 365, 'Conundrum Entertainment': 366, 'Golden Harvest Company': 367, 'Cinergi Pictures Entertainment': 368, 'TF1 Films Production': 369, 'Amen Ra Films': 370, 'Home Box Office (HBO)': 371, 'Mandate Pictures': 372, 'Icon Entertainment International': 373, 'The Guber-Peters Company': 374, 'CJ Entertainment': 375, 'Strike Entertainment': 376, 'Mirage Enterprises': 377, 'Tribeca Productions': 378, 'Plan B Entertainment': 379, 'Silver Screen Partners IV': 380, 'Silver Screen Partners III': 381, 'The Weinstein Company': 382, 'Mandalay Pictures': 383, 'Mayhem Pictures': 384, 'FilmColony': 385, 'Alcon Entertainment': 386, 'Allied Filmmakers': 387, 'STX Entertainment': 388, 'StudioCanal': 389, 'Huayi Brothers Media': 390, 'Will Packer Productions': 391, 'Annapurna Pictures': 392, 'China Film Co-Production Corporation': 393, 'Touchstone Pictures': 394, 'Working Title Films': 395, 'DENTSU Music And Entertainment': 396, 'Dune Entertainment III': 397, 'FilmDistrict': 398, 'Gerber Pictures': 399, 'Film Workshop': 400, 'Warner Bros. 
Family Entertainment': 401, 'The Mark Gordon Company': 402, 'Metro-Goldwyn-Mayer (MGM)': 403, 'Dark Horse Entertainment': 404, 'Global Entertainment Productions GmbH & Company Medien KG': 405, 'Malpaso Productions': 406, 'Pacific Western': 407, 'Millennium Films': 408, 'Goldcrest Pictures': 409, 'Craven-Maddalena Films': 410, 'Avnet/Kerner Productions': 411, 'Majestic Films International': 412, 'Revolution Studios': 413, 'Contrafilm': 414, "Mel's Cite du Cinema": 415, 'Initial Entertainment Group (IEG)': 416, 'Constantin Film': 417, 'Brad Grey Pictures': 418, 'Cold Spring Pictures': 419, 'Karz Entertainment': 420, 'Icon Productions': 421, 'Mutual Film Company': 422, 'Anton': 423, 'EuropaCorp': 424, 'Gold Circle Films': 425, 'Avery Pix': 426, 'Good Universe': 427, 'Scott Rudin Productions': 428, 'Smokehouse Pictures': 429, 'Ghost House Pictures': 430, 'Laika Entertainment': 431, 'Hughes Entertainment': 432, 'Blumhouse Productions': 433, 'Appian Way': 434, 'Canadian Film or Video Production Tax Credit (CPTC)': 435, 'Escape Artists': 436, 'Broken Road Productions': 437, 'Marc Platt Productions': 438, 'Point Grey Pictures': 439, 'Tall Trees Productions': 440, 'Disneytoon Studios': 441, 'Chartoff-Winkler Productions': 442, 'BenderSpink': 443, 'Bluegrass Films': 444, 'Red Hour Films': 445, 'Black Label Media': 446, 'Cross Creek Pictures': 447, 'Mace Neufeld Productions': 448, 'Davis Entertainment': 449, 'Juno Pix': 450, 'Radar Pictures': 451, 'Lionsgate': 452, 'New Line Cinema': 453, 'Apatow Productions': 454, 'New Regency Productions': 455, 'A Band Apart': 456, 'Gordon Company': 457, 'Konrad Pictures': 458, 'Alphaville Films': 459, 'Twisted Pictures': 460, 'QED International': 461, 'De Line Pictures': 462, 'Kennedy Miller Productions': 463, '21 Laps Entertainment': 464, 'NPV Entertainment': 465, 'Imagine Entertainment': 466, 'Fox 2000 Pictures': 467, 'Offspring Entertainment': 468, 'Northern Lights Entertainment': 469, 'Protozoa Pictures': 470, 'Donner/Shuler-Donner 
Productions': 471, 'Reliance Entertainment': 472, 'Gary Sanchez Productions': 473, 'Scott Free Productions': 474, 'Flower Films (II)': 475, 'Paramount Pictures': 476, 'TIK Films': 477, 'Thunder Road Pictures': 478, 'Spyglass Entertainment': 479, 'Columbia Pictures': 480, 'Studio Ghibli': 481, 'Silver Pictures': 482, 'Happy Madison Productions': 483, 'Universal Pictures': 484, 'The Zanuck Company': 485, 'Forward Pass': 486, 'Mosaic': 487, 'Gracie Films': 488, 'Nickelodeon Movies': 489, 'Roger Birnbaum Productions': 490, 'Summit Entertainment': 491, 'Relativity Media': 492, 'Gunn Films': 493, 'Walden Media': 494, 'Twentieth Century Fox': 495, 'Warner Bros.': 496, 'Eddie Murphy Productions': 497, 'Red Wagon Entertainment': 498, 'Red Granite Pictures': 499, 'Nippon Television Network (NTV)': 500, 'Jerry Weintraub Productions': 501, 'Platinum Dunes': 502, 'Marv Films': 503, 'Brandywine Productions': 504, 'Troublemaker Studios': 505, 'Ingenious Film Partners': 506, 'Aardman Animations': 507, 'Virtual Studios': 508, 'Solana Films': 509, 'Tig Productions': 510, 'Mandeville Films': 511, 'Sony Pictures Entertainment (SPE)': 512, 'RatPac Entertainment': 513, 'Valhalla Motion Pictures': 514, 'Dreamworks Pictures': 515, 'Zanuck/Brown Productions': 516, 'Tim Burton Productions': 517, 'Perfect World Pictures': 518, 'Zide-Perry Productions': 519, 'The Bedford Falls Company': 520, "Donners' Company": 521, 'Team Todd': 522, 'ImageMovers': 523, 'Kopelson Entertainment': 524, 'Original Film': 525, 'Overbrook Entertainment': 526, 'Parkes/MacDonald Image Nation': 527, 'M6 Films': 528, 'Ingenious Media': 529, 'LStar Capital': 530, 'Hurwitz Creative': 531, 'Eon Productions': 532, 'Prime Focus': 533, 'Digital Image Associates': 534, 'Michael De Luca Productions': 535, 'Vertigo Entertainment': 536, 'Village Roadshow Pictures': 537, 'Cruise/Wagner Productions': 538, 'Di Bonaventura Pictures': 539, 'GK Films': 540, 'DMG Entertainment': 541, 'TSG Entertainment': 542, 'Don Simpson/Jerry 
Bruckheimer Films': 543, 'Grive Productions': 544, 'RatPac-Dune Entertainment': 545, 'Dentsu': 546, 'Chernin Entertainment': 547, 'The Kennedy/Marshall Company': 548, 'Media Rights Capital (MRC)': 549, 'Amblin Entertainment': 550, 'Blinding Edge Pictures': 551, 'Temple Hill Entertainment': 552, 'Sony Pictures Animation': 553, 'Dune Entertainment': 554, 'Bona Film Group': 555, 'Atlas Entertainment': 556, 'Green Hat Films': 557, 'Studio Babelsberg': 558, 'Walt Disney Animation Studios': 559, 'The Safran Company': 560, 'DreamWorks': 561, '1492 Pictures': 562, 'Skydance Media': 563, 'Sunswept Entertainment': 564, 'Maverick Films': 565, 'Color Force': 566, 'Walt Disney Pictures': 567, 'Alibaba Pictures': 568, 'Jerry Bruckheimer Films': 569, 'FortyFour Studios': 570, 'Legendary Entertainment': 571, 'Roth Films': 572, 'Animal Logic': 573, 'Twentieth Century Fox Animation': 574, 'The Saul Zaentz Company': 575, 'Marvel Enterprises': 576, 'Centropolis Entertainment': 577, 'DreamWorks Animation': 578, 'DC Entertainment': 579, 'Bad Robot': 580, 'China Film Group Corporation (CFGC)': 581, 'Blue Sky Studios': 582, 'Hasbro': 583, 'Danjaq': 584, 'Lucasfilm': 585, 'Pacific Data Images (PDI)': 586, 'Marvel Entertainment': 587, 'WingNut Films': 588, 'Laura Ziskin Productions': 589, 'Heyday Films': 590, 'Syncopy': 591, 'Pixar Animation Studios': 592, 'Illumination Entertainment': 593, 'Lightstorm Entertainment': 594, 'Marvel Studios': 595}
Compute each movie's rank as the maximum rank among its studios.
Studios that do not appear in the list are ignored; if none of a movie's studios is ranked, a random rank between 1 and 200 is assigned.
import random
def getStudioRank(studios):
    """Return the highest rank among a movie's studios.

    Studios missing from the module-level ``studioRank`` mapping are skipped.
    When no studio contributes a positive rank (for example an empty list),
    a random integer rank from 1 to 200 is returned instead.
    """
    # The running maximum used to be a local called ``max``, which hid the
    # builtin; seeding the builtin with 0 keeps the same "nothing found" value.
    best = max([0, *(studioRank[studio] for studio in studios if studio in studioRank)])
    if best != 0:
        return best
    return random.randint(1, 200)


data2 = data.copy()
data2['StudioRank'] = data2['Studios'].apply(getStudioRank)
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between StudioRank and Gross: %.3f' % corr)
data2.plot.scatter(x='StudioRank', y='Gross_worldwide', color='blue')
Pearsons correlation between StudioRank and Gross: 0.392
<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>
Compute each movie's rank as the total (sum) of the ranks of its studios.
Each studio that does not appear in the list is given a random rank between 1 and 200.
import random
def getStudioRank(studios):
    """Return the sum of the ranks of a movie's studios.

    A studio that is not a key of the module-level ``studioRank`` mapping
    contributes a random integer from 1 to 200 instead of a fixed rank.
    """
    return sum(
        studioRank[name] if name in studioRank.keys() else random.randint(1, 200)
        for name in studios
    )


# Attach the total-rank feature to a copy of the data and measure its
# Pearson correlation with worldwide gross.
data2 = data.assign(StudioRank=data['Studios'].apply(getStudioRank))
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between StudioRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='StudioRank', y='Gross_worldwide', color='blue')
Pearsons correlation between StudioRank and Gross: 0.473
<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>
# Copy the StudioRank column currently held in data2 into the final feature
# table (when cells run in order, this is the total-rank variant).
final['StudioRank'] = data2['StudioRank']
Compute each movie's rank as the mean rank of its studios.
Each studio that does not appear in the list is given a random rank between 0 and 200.
import random
def getStudioRank(studios):
    """Return the mean rank of a movie's studios.

    A studio that is not a key of the module-level ``studioRank`` mapping
    contributes a random integer from 0 to 200. An empty ``studios`` list
    yields 0.0, because the divisor falls back to 1.
    """
    divisor = len(studios) or 1
    total = sum(
        studioRank[name] if name in studioRank.keys() else random.randint(0, 200)
        for name in studios
    )
    return total / divisor


# Attach the mean-rank feature to a copy of the data and measure its
# Pearson correlation with worldwide gross.
data2 = data.assign(StudioRank=data['Studios'].apply(getStudioRank))
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between StudioRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='StudioRank', y='Gross_worldwide', color='blue')
Pearsons correlation between StudioRank and Gross: 0.433
<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>
We decide to use the total rank.
Now we will extract more features, namely:
NumTopStudios
# Order studios by mean gross (highest first) and keep the first 100 names.
studios10Larger.sort_values(by='Mean', ascending=False, inplace=True)
top100Studios = list(studios10Larger['Studios'][0:100])


def getNumTopStudios(studios):
    """Count how many entries of ``studios`` appear in ``top100Studios``."""
    return sum(1 for name in studios if name in top100Studios)


data2 = data.assign(NumTopStudios=data['Studios'].apply(getNumTopStudios))
data2['NumTopStudios'].value_counts()
0 7009 1 1248 2 433 3 62 Name: NumTopStudios, dtype: int64
# Pearson correlation between the top-studio count and worldwide gross,
# followed by a scatter plot of the same two columns.
corr = pearsonr(data2['NumTopStudios'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumTopStudios and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='NumTopStudios', y='Gross_worldwide', color='brown')
Pearsons correlation between NumTopStudios and Gross: 0.530
<AxesSubplot:xlabel='NumTopStudios', ylabel='Gross_worldwide'>
# Copy the NumTopStudios column from data2 into the final feature table.
final['NumTopStudios'] = data2['NumTopStudios']
HasTopStudio
studios10Larger.sort_values(by='Mean', ascending=False, inplace=True)
# Only the 30 best studios are used here. They get their own name so the
# 100-entry ``top100Studios`` list read by getNumTopStudios is not overwritten.
top30Studios = list(studios10Larger['Studios'][0:30])


def getHasTopStudio(studios):
    """Return 1 if any entry of ``studios`` is in ``top30Studios``, else 0."""
    return int(any(studio in top30Studios for studio in studios))


data2 = data.copy()
data2['HasTopStudio'] = data2['Studios'].apply(getHasTopStudio)
data2['HasTopStudio'].value_counts()
corr = pearsonr(data2['HasTopStudio'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between HasTopStudio and Gross: %.3f' % corr)
data2.plot.scatter(x='HasTopStudio', y='Gross_worldwide', color='brown')
Pearsons correlation between HasTopStudio and Gross: 0.504
<AxesSubplot:xlabel='HasTopStudio', ylabel='Gross_worldwide'>
# Copy the HasTopStudio flag from data2 into the final feature table.
final['HasTopStudio'] = data2['HasTopStudio']
Hypothesis: Does gross depend on the number of countries involved in each movie?
# Replace each movie's Countries entry by its length (on a copy of the data)
# and show how often each length occurs.
data2 = data.assign(Countries=data['Countries'].apply(len))
data2['Countries'].value_counts()
1 5743 2 1873 3 676 4 269 5 114 6 45 7 10 8 10 0 5 9 4 11 2 19 1 Name: Countries, dtype: int64
# Pearson correlation between the per-movie country count and worldwide
# gross, followed by a scatter plot of the same two columns.
corr = pearsonr(data2['Countries'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between Countries and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='Countries', y='Gross_worldwide', color='brown')
Pearsons correlation between Countries and Gross: 0.087
<AxesSubplot:xlabel='Countries', ylabel='Gross_worldwide'>
Very low correlation of 0.087
Gross of each countries and Total Gross
# Aggregate the data per country; the resulting frame is read below through
# its 'Countries', 'Count', 'Total' and 'Mean' columns.
country = parseWithMoneyAndCount(data, 'Countries')
# country=country[country['Count']>20]

# Figure 1: the 20 countries with the most releases.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,1)
data2 = country.sort_values(by='Count', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Releases", fontsize=20)
plt.title("Movie Releases By Country", fontsize=20)

# Figure 2: the 20 countries with the largest total gross.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,2)
data2 = country.sort_values(by='Total', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Total', color="salmon")
plt.ylabel("Total Gross", fontsize=20)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel("Countries", fontsize=20)
plt.title("Total Gross By Country", fontsize=20)

# Figure 3: the 20 countries with the largest mean gross.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,1)
data2 = country.sort_values(by='Mean', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Average Gross", fontsize=20)
plt.title("Average Gross By Country", fontsize=20)
plt.show()
## List of all country names, kept for QUESTION 8
list_country = list(country['Countries'])
This ranking by average gross is not meaningful, since some countries have very few releases (often only one).
# Show the per-country table ordered by mean gross, highest first.
country.sort_values(by='Mean', ascending=False)
Countries | Total | Count | Mean | Median | |
---|---|---|---|---|---|
6 | Bahamas | 616502912 | 1 | 6.165029e+08 | 616502912.0 |
32 | Fiji | 429632142 | 1 | 4.296321e+08 | 429632142.0 |
62 | Malta | 3027543520 | 11 | 2.752312e+08 | 240697856.0 |
67 | Morocco | 3758242989 | 14 | 2.684459e+08 | 151314187.0 |
72 | New Zealand | 10467742795 | 48 | 2.180780e+08 | 74271180.0 |
... | ... | ... | ... | ... | ... |
10 | Bhutan | 1792370 | 2 | 8.961850e+05 | 896185.0 |
93 | Soviet Union | 2311743 | 3 | 7.705810e+05 | 93292.0 |
35 | Georgia | 686704 | 1 | 6.867040e+05 | 686704.0 |
58 | Liberia | 555533 | 1 | 5.555330e+05 | 555533.0 |
39 | Haiti | 352296 | 1 | 3.522960e+05 | 352296.0 |
111 rows × 5 columns
We can see that most movies are released in the United States.
We will test whether movies released in the United States have a higher gross than other films.
# Flag movies whose Countries entry contains "United States" (1) or not (0),
# then report the Pearson correlation of that flag with worldwide gross.
data2 = data.assign(
    isUnitedStates=data['Countries'].apply(lambda x: int("United States" in x))
)
corr = pearsonr(data2['isUnitedStates'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between isUnitedStates and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='isUnitedStates', y='Gross_worldwide', color='brown')
Pearsons correlation between isUnitedStates and Gross: 0.129
<AxesSubplot:xlabel='isUnitedStates', ylabel='Gross_worldwide'>
# Copy the isUnitedStates flag from data2 into the final feature table.
final['isUnitedStates'] = data2['isUnitedStates']
The correlation seems low.
Now we will exclude countries that have fewer than 100 releases.
# Keep only countries with at least 100 releases, order them by mean gross
# (highest first) and plot the means. The bare country.sort_values(...) call
# that used to open this cell was removed because its result was discarded.
release100Countries = country[country['Count'] >= 100]
release100Countries = release100Countries.sort_values(by='Mean', ascending=False)
release100Countries.plot.bar(x='Countries', y='Mean', color='green')
<AxesSubplot:xlabel='Countries'>
We will use this list of countries to build a rank feature.
release100Countries = release100Countries.sort_values(by='Mean', ascending=True).reset_index(drop=True)
# Ranks start at 1 (lowest mean gross) so that 0 is left free for films
# outside this list.
countryRank = {
    name: position
    for position, name in enumerate(release100Countries['Countries'], start=1)
}
countryRank
{'India': 1, 'Italy': 2, 'France': 3, 'Spain': 4, 'Germany': 5, 'United States': 6, 'Canada': 7, 'United Kingdom': 8, 'Mexico': 9, 'Hong Kong': 10, 'Japan': 11, 'Australia': 12, 'China': 13}
Get rank by the maximum
def getCountryRank(countries):
    """Return the highest rank among a movie's countries.

    Countries missing from the module-level ``countryRank`` mapping are
    skipped. When no country contributes a positive rank, a random integer
    rank from 1 to 10 is returned instead.
    """
    # Named ``best`` rather than ``max`` so the builtin stays usable.
    best = max([0, *(countryRank[name] for name in countries if name in countryRank)])
    if best == 0:
        best = random.randint(1, 10)
    return best


data2 = data.copy()
data2['CountryRank'] = data2['Countries'].apply(getCountryRank)
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CountryRank', y='Gross_worldwide', color='blue')
Pearsons correlation between CountryRank and Gross: 0.153
<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>
# Copy the CountryRank column currently held in data2 into the final feature
# table (when cells run in order, this is the max-rank variant).
final['CountryRank'] = data2['CountryRank']
By total
def getCountryRank(countries):
    """Return the sum of the ranks of the countries found in ``countryRank``.

    Countries that are not keys of the mapping contribute nothing, so a movie
    with no ranked country gets 0.
    """
    return sum(countryRank[name] for name in countries if name in countryRank.keys())


# Attach the total country rank to a copy of the data and measure its
# Pearson correlation with worldwide gross.
data2 = data.assign(CountryRank=data['Countries'].apply(getCountryRank))
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='CountryRank', y='Gross_worldwide', color='blue')
Pearsons correlation between CountryRank and Gross: 0.161
<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>
By presence in the list (1 if any of the movie's countries is ranked, otherwise 0)
def getCountryRank(countries):
    """Return 1 if at least one country is a key of ``countryRank``, else 0."""
    return int(any(name in countryRank.keys() for name in countries))


# Attach the membership flag to a copy of the data and measure its Pearson
# correlation with worldwide gross.
data2 = data.assign(CountryRank=data['Countries'].apply(getCountryRank))
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='CountryRank', y='Gross_worldwide', color='blue')
Pearsons correlation between CountryRank and Gross: 0.034
<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>
How are releases distributed by language?
# Aggregate the data per language; the resulting frame is read below through
# its 'Languages', 'Total' and 'Count' columns.
language = parseWithMoneyAndCount(data, 'Languages')
# NOTE(review): the 20 rows are selected by 'Total', but the bars below show
# 'Count' -- confirm that this selection is intended.
data2 = language.sort_values(by='Total', ascending=False)[0:20]
fig = plt.figure(figsize=(8, 4))
plt.bar(data=data2, x='Languages', height='Count', color="salmon")
plt.ylabel("Releases", fontsize=20)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel("Languages", fontsize=20)
plt.title("Release By Language", fontsize=20)
plt.show()
Nowadays, almost all films include English. We will check whether a film being spoken in English, as opposed to only other languages, relates to its gross.
# Flag movies whose Languages entry contains 'English' (1) or not (0) and
# show how many movies fall in each group.
data2 = data.assign(IsEnglish=data['Languages'].apply(lambda x: int('English' in x)))
data2['IsEnglish'].value_counts()
1 8150 0 602 Name: IsEnglish, dtype: int64
# Pearson correlation between the IsEnglish flag and worldwide gross,
# followed by a scatter plot. The printed label now spells the column name
# correctly ('IsEnglish' instead of 'IsEnnglish').
corr = pearsonr(data2['IsEnglish'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between IsEnglish and Gross: %.3f' % corr)
data2.plot.scatter(x='IsEnglish', y='Gross_worldwide', color='blue')
Pearsons correlation between IsEnnglish and Gross: 0.076
<AxesSubplot:xlabel='IsEnglish', ylabel='Gross_worldwide'>
Correlation of 0.076
# Copy the IsEnglish flag from data2 into the final feature table.
final['IsEnglish'] = data2['IsEnglish']
Hypothesis: Does gross depend on the number of keywords?
# Add the length of each movie's Keywords entry as NumKeywords (on a copy of
# the data) and show how often each length occurs.
data2 = data.assign(NumKeywords=data['Keywords'].apply(len))
data2['NumKeywords'].value_counts()
5 8566 1 89 2 34 4 24 3 21 0 18 Name: NumKeywords, dtype: int64
# Pearson correlation between the keyword count and worldwide gross,
# followed by a scatter plot of the same two columns.
corr = pearsonr(data2['NumKeywords'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumKeywords and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='NumKeywords', y='Gross_worldwide', color='blue')
Pearsons correlation between NumKeywords and Gross: 0.043
<AxesSubplot:xlabel='NumKeywords', ylabel='Gross_worldwide'>
Not relevant
How is gross distributed by keyword?
# Aggregate the data per keyword; the resulting frame is read below through
# its 'Keywords' and 'Count' columns.
keyword = parseWithMoneyAndCount(data, 'Keywords')
fig = plt.figure(figsize=(8, 4))
# Bar chart of the 15 keywords with the highest 'Count'.
data2 = keyword.sort_values(by='Count', ascending=False)[0:15]
plt.bar(data=data2, x='Keywords', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Number of movie releases with certain kind of keywords", fontsize=15)
plt.ylabel("Count", fontsize=15)
Text(0, 0.5, 'Count')
# Ten keywords with the highest mean gross.
keyword.sort_values(by='Mean', ascending=False).head(10)
Keywords | Total | Count | Mean | Median | |
---|---|---|---|---|---|
4833 | forest protection | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
2911 | cosmic | 2797501328 | 1 | 2.797501e+09 | 2.797501e+09 |
7144 | love affair | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
10605 | sailor's death | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
11980 | sunken ship | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
10649 | sanitation employee | 2069521700 | 1 | 2.069522e+09 | 2.069522e+09 |
3314 | death of recurring character | 2048359754 | 1 | 2.048360e+09 | 2.048360e+09 |
10574 | s.h.i.e.l.d. | 1518815515 | 1 | 1.518816e+09 | 1.518816e+09 |
2063 | car falling off a cliff | 1515341399 | 1 | 1.515341e+09 | 1.515341e+09 |
3521 | disney animated sequel | 1450026933 | 1 | 1.450027e+09 | 1.450027e+09 |
# Ten keywords with the highest median gross.
keyword.sort_values(by='Median', ascending=False).head(10)
Keywords | Total | Count | Mean | Median | |
---|---|---|---|---|---|
4833 | forest protection | 2847246203 | 1 | 2.847246e+09 | 2.847246e+09 |
2911 | cosmic | 2797501328 | 1 | 2.797501e+09 | 2.797501e+09 |
7144 | love affair | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
11980 | sunken ship | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
10605 | sailor's death | 2201647264 | 1 | 2.201647e+09 | 2.201647e+09 |
10649 | sanitation employee | 2069521700 | 1 | 2.069522e+09 | 2.069522e+09 |
3314 | death of recurring character | 2048359754 | 1 | 2.048360e+09 | 2.048360e+09 |
10574 | s.h.i.e.l.d. | 1518815515 | 1 | 1.518816e+09 | 1.518816e+09 |
2063 | car falling off a cliff | 1515341399 | 1 | 1.515341e+09 | 1.515341e+09 |
3521 | disney animated sequel | 1450026933 | 1 | 1.450027e+09 | 1.450027e+09 |
These keywords have a high mean and median only because each of them appears in a single movie (Count = 1).
# Keep keywords that occur in more than 20 movies, then list them by mean
# gross. NOTE(review): the variable name says 50 but the threshold is 20.
count50Keywords = keyword.loc[keyword['Count'] > 20]
count50Keywords.sort_values(by='Mean', ascending=False)
Keywords | Total | Count | Mean | Median | |
---|---|---|---|---|---|
11995 | superhero | 32041168607 | 61 | 5.252651e+08 | 370569774.0 |
1120 | based on comic book | 10474265400 | 25 | 4.189706e+08 | 230884728.0 |
7217 | magic | 10738490415 | 36 | 2.982914e+08 | 111927462.0 |
9413 | princess | 6618049857 | 25 | 2.647220e+08 | 55534455.0 |
6675 | king | 7831973626 | 30 | 2.610658e+08 | 57671894.0 |
... | ... | ... | ... | ... | ... |
8615 | one word title | 1203447244 | 68 | 1.769775e+07 | 6258238.5 |
12603 | topless female nudity | 609853613 | 37 | 1.648253e+07 | 3039587.0 |
7275 | male full frontal nudity | 434996581 | 30 | 1.449989e+07 | 3005289.5 |
5839 | homosexual | 361636313 | 35 | 1.033247e+07 | 5526675.0 |
6154 | independent film | 166134909 | 33 | 5.034391e+06 | 1467396.0 |
306 rows × 5 columns
# Bar chart of the 30 retained keywords with the highest mean gross.
count50Keywords.sort_values(by='Mean', ascending=False).head(30).plot.bar(x='Keywords', y='Mean', color='green')
<AxesSubplot:xlabel='Keywords'>
The superhero keyword is popular in the 2010s and has a high average gross.
Rank keywords
count50Keywords = count50Keywords.sort_values(by='Mean', ascending=True).reset_index(drop=True)
# Ranks start at 1 (lowest mean gross) so that 0 is left free for films
# without any ranked keyword.
keywordRank = {
    name: position
    for position, name in enumerate(count50Keywords['Keywords'], start=1)
}
keywordRank
{'independent film': 1, 'homosexual': 2, 'male full frontal nudity': 3, 'topless female nudity': 4, 'one word title': 5, 'scene during opening credits': 6, 'timeframe 1990s': 7, 'lust': 8, 'title directed by female': 9, 'female frontal nudity': 10, 'three word title': 11, 'infidelity': 12, 'written by director': 13, 'male frontal nudity': 14, 'student': 15, 'african american': 16, 'bare breasts': 17, 'lesbian': 18, 'hood': 19, 'bar': 20, 'actor': 21, 'breasts': 22, 'two word title': 23, 'vietnam war veteran': 24, 'restaurant': 25, 'secret': 26, 'gay': 27, 'watching tv': 28, 'coming of age': 29, 'slasher': 30, 'obsession': 31, 'widow': 32, '1950s': 33, 'period drama': 34, 'public nudity': 35, 'panties': 36, 'satire': 37, 'woman on top': 38, 'jewish': 39, 'male rear nudity': 40, 'looking at oneself in a mirror': 41, 'money': 42, 'f rated': 43, 'surrealism': 44, 'aerial camera shot': 45, 'lesbian kiss': 46, 'character name as title': 47, 'teacher': 48, 'female full frontal nudity': 49, 'bully': 50, 'reporter': 51, '1930s': 52, 'coach': 53, 'waitress': 54, 'teenage boy': 55, 'love': 56, 'movie flop': 57, 'fbi federal bureau of investigation': 58, 'sex': 59, 'italy': 60, '1960s': 61, 'apartment': 62, 'marriage': 63, 'police officer': 64, 'male nudity': 65, 'bare chested male': 66, 'four word title': 67, 'adultery': 68, 'girl': 69, 'widower': 70, 'vietnam': 71, 'car accident': 72, 'friend': 73, 'doctor': 74, 'boy': 75, 'pubic hair': 76, 'teenage girl': 77, 'boyfriend girlfriend relationship': 78, 'husband wife relationship': 79, 'nudity': 80, 'baseball': 81, 'neo noir': 82, 'rape': 83, 'black comedy': 84, 'gang': 85, 'hospital': 86, 'white panties': 87, 'voyeur': 88, 'character names as title': 89, 'small town': 90, 'concert': 91, 'college': 92, 'thief': 93, 'writer': 94, 'pregnancy': 95, '1920s': 96, 'priest': 97, 'sex comedy': 98, 'lingerie': 99, 'hare krishna': 100, 'robbery': 101, 'cult film': 102, 'suicide': 103, 'femme fatale': 104, 'church': 105, 'actress': 106, 
'singer': 107, 'london england': 108, 'road trip': 109, 'seduction': 110, 'journalist': 111, 'bikini': 112, '1940s': 113, 'texas': 114, 'family relationships': 115, 'female rear nudity': 116, 'female nudity': 117, 'dance': 118, 'gangster': 119, 'man wears eyeglasses': 120, 'sex scene': 121, 'automobile': 122, 'brother brother relationship': 123, 'lawyer': 124, 'extramarital affair': 125, 'prison': 126, 'murder': 127, 'airport': 128, 'nun': 129, 'grief': 130, 'criminal': 131, 'serial killer': 132, 'cancer': 133, 'psychotronic film': 134, 'neighbor': 135, 'columbia tristar': 136, 'slimehouse': 137, 'spoof': 138, 'male objectification': 139, 'best friend': 140, 'school': 141, 'military': 142, 'based on real person': 143, 'voyeurism': 144, 'detective': 145, 'slapstick comedy': 146, 'dysfunctional family': 147, 'love triangle': 148, 'basketball': 149, 'private detective': 150, 'nightclub': 151, 'farce': 152, 'flashback': 153, 'high school': 154, 'teenager': 155, 'vietnam war': 156, 'politics': 157, 'nightmare': 158, 'hitman': 159, 'pantyhose': 160, 'mafia': 161, 'jealousy': 162, 'drugs': 163, 'blonde': 164, 'dancing': 165, 'village': 166, 'neo screwball comedy': 167, 'cleavage': 168, 'scantily clad female': 169, 'racism': 170, 'mother daughter relationship': 171, 'based on true story': 172, 'psychopath': 173, 'dream': 174, 'nazi': 175, 'united states': 176, 'new york city': 177, '1970s': 178, 'undercover': 179, 'prostitute': 180, 'investigation': 181, 'mother son relationship': 182, 'boxing': 183, 'road movie': 184, 'farm': 185, 'musician': 186, 'competition': 187, 'blood': 188, 'train': 189, 'usa': 190, 'psychiatrist': 191, 'los angeles california': 192, 'horse': 193, 'new york': 194, 'stripper': 195, '1980s': 196, 'california': 197, 'singing in a car': 198, 'baby': 199, 'nurse': 200, 'police': 201, 'faith': 202, 'on the road': 203, 'torture': 204, 'supernatural horror': 205, 'bound and gagged': 206, 'father son relationship': 207, 'disney': 208, 'vomiting': 209, 
'female protagonist': 210, 'kiss': 211, 'single mother': 212, 'hostage': 213, 'england': 214, 'car': 215, 'violence': 216, 'revenge': 217, 'beach': 218, 'parody': 219, 'woman in jeopardy': 220, 'india': 221, 'paris france': 222, 'brother sister relationship': 223, 'friendship': 224, 'funeral': 225, 'strong female character': 226, 'christmas': 227, 'on the run': 228, 'party': 229, '2010s': 230, 'desert': 231, 'divorce': 232, 'fish out of water': 233, 'japan': 234, 'dog': 235, 'death': 236, 'scientist': 237, '1990s': 238, 'father daughter relationship': 239, 'no opening credits': 240, 'chase': 241, 'escape': 242, 'male protagonist': 243, 'memory': 244, 'fight': 245, 'singing': 246, 'mexico': 247, 'soldier': 248, 'snow': 249, 'sister sister relationship': 250, 'ghost': 251, 'time bomb': 252, 'wedding': 253, 'hotel': 254, 'zombie': 255, 'france': 256, 'heist': 257, 'robot': 258, 'gun': 259, 'deception': 260, '2000s': 261, 'cat': 262, 'based on novel': 263, 'martial arts': 264, 'demon': 265, 'kidnapping': 266, 'race against time': 267, 'teen movie': 268, 'conspiracy': 269, 'survival': 270, 'rescue': 271, 'china': 272, 'world war two': 273, 'orphan': 274, 'assassin': 275, 'vampire': 276, 'witch': 277, '3 dimensional': 278, 'island': 279, 'astronaut': 280, 'outer space': 281, 'hero': 282, 'terrorist': 283, 'good versus evil': 284, 'dystopia': 285, 'supernatural power': 286, 'remake': 287, 'second part': 288, 'monster': 289, 'spy': 290, 'africa': 291, 'surprise ending': 292, 'battle': 293, 'post apocalypse': 294, 'alien': 295, 'betrayal': 296, 'villain': 297, 'jungle': 298, 'time travel': 299, 'future': 300, 'sequel': 301, 'king': 302, 'princess': 303, 'magic': 304, 'based on comic book': 305, 'superhero': 306}
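Get the keyword rank as the maximum rank among a movie's keywords.
The running maximum starts at 0; if none of the movie's keywords is ranked, a random rank between 1 and 100 is assigned.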
import random
def getKeywordsRank(keywords):
    """Return the highest rank among a movie's keywords.

    Keywords missing from the module-level ``keywordRank`` mapping are
    skipped. When no keyword contributes a positive rank, a random integer
    rank from 1 to 100 is returned instead.
    """
    # ``best`` replaces a local called ``max`` that hid the builtin, and the
    # loop variable is ``word`` so it no longer reuses the name of the
    # module-level ``keyword`` DataFrame.
    best = max([0, *(keywordRank[word] for word in keywords if word in keywordRank)])
    if best == 0:
        best = random.randint(1, 100)
    return best


data2 = data.copy()
data2['keywordRank'] = data2['Keywords'].apply(getKeywordsRank)
corr = pearsonr(data2['keywordRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between keywordRank and Gross: %.3f' % corr)
data2.plot.scatter(x='keywordRank', y='Gross_worldwide', color='blue')
Pearsons correlation between keywordRank and Gross: 0.167
<AxesSubplot:xlabel='keywordRank', ylabel='Gross_worldwide'>
import random
def getKeywordsRank(keywords):
    """Return the mean rank of a movie's keywords.

    A keyword that is not a key of the module-level ``keywordRank`` mapping
    contributes a random integer from 1 to 100. An empty ``keywords`` list
    yields 0.0, because the divisor falls back to 1.
    """
    divisor = len(keywords) or 1
    total = sum(
        keywordRank[word] if word in keywordRank.keys() else random.randint(1, 100)
        for word in keywords
    )
    return total / divisor


# Attach the mean keyword rank to a copy of the data and measure its Pearson
# correlation with worldwide gross.
data2 = data.assign(keywordRank=data['Keywords'].apply(getKeywordsRank))
corr = pearsonr(data2['keywordRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between keywordRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='keywordRank', y='Gross_worldwide', color='blue')
Pearsons correlation between keywordRank and Gross: 0.162
<AxesSubplot:xlabel='keywordRank', ylabel='Gross_worldwide'>
# Copy the keywordRank column currently held in data2 into the final feature
# table (when cells run in order, this is the mean-rank variant).
final['keywordRank'] = data2['keywordRank']
How is gross distributed by MPAA certificate?
# Aggregate the data per certificate; the resulting frame is read below
# through its 'ListOfCertificate', 'Count', 'Median' and 'Mean' columns.
certificate = parseWithMoneyAndCount(data, 'ListOfCertificate')

# Figure 1: number of releases per certificate, most frequent first.
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Count', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Count", fontsize=20)
plt.title("Releases By Certificate", fontsize=20)
plt.show()

# Figure 2: median gross per certificate, highest first.
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Median', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Median Gross", fontsize=20)
plt.title("Gross By Certificate", fontsize=20)
plt.show()

# Figure 3: mean gross per certificate, highest first (same title as the
# median figure; only the y-label tells them apart).
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Mean', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Mean Gross", fontsize=20)
plt.title("Gross By Certificate", fontsize=20)
plt.show()
NC-17 has the fewest releases.
PG-13 and PG have the highest mean and median gross, while R and NC-17 have lower mean and median gross than PG-13 and PG. This is understandable, since R and NC-17 restrict the age of the audience that can watch the film, hence the lower gross.
# Display the per-certificate aggregate table.
certificate
ListOfCertificate | Total | Count | Mean | Median | |
---|---|---|---|---|---|
0 | G | 42824185750 | 937 | 4.570351e+07 | 6780490.0 |
1 | NC-17 | 925845011 | 37 | 2.502284e+07 | 7412216.0 |
2 | PG | 144055409565 | 1621 | 8.886824e+07 | 23237911.0 |
3 | PG-13 | 283553468080 | 2466 | 1.149852e+08 | 34714400.0 |
4 | R | 170006305715 | 3867 | 4.396336e+07 | 14000000.0 |
Conduct rank by Certificate
By Mean
certificate = certificate.sort_values(by='Mean', ascending=True).reset_index(drop=True)
# Rank 1 goes to the certificate with the lowest mean gross.
cerRank = {
    name: position
    for position, name in enumerate(certificate['ListOfCertificate'], start=1)
}
cerRank
{'NC-17': 1, 'R': 2, 'G': 3, 'PG': 4, 'PG-13': 5}
def getCerRank(cers):
    """Return the highest rank among a movie's certificates.

    Certificates missing from the module-level ``cerRank`` mapping are
    skipped; 0 is returned when none of them is ranked.
    """
    # Uses the builtin ``max`` seeded with 0 instead of a local variable
    # called ``max`` that hid the builtin.
    return max([0, *(cerRank[cer] for cer in cers if cer in cerRank)])


data2 = data.copy()
data2['cerRank'] = data2['ListOfCertificate'].apply(getCerRank)
corr = pearsonr(data2['cerRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between cerRank and Gross: %.3f' % corr)
data2.plot.scatter(x='cerRank', y='Gross_worldwide', color='blue')
Pearsons correlation between cerRank and Gross: 0.199
<AxesSubplot:xlabel='cerRank', ylabel='Gross_worldwide'>
# Copy the cerRank column from data2 into the final feature table.
final['cerRank'] = data2['cerRank']
We see that PG-13 has the highest mean gross.
We will test whether movies rated PG-13 gross more than others.
# Flag movies whose ListOfCertificate entry contains 'PG-13' (1) or not (0),
# then report the Pearson correlation of that flag with worldwide gross.
data2 = data.copy()
data2['PG-13'] = data2['ListOfCertificate'].apply(lambda x: int('PG-13' in x))
corr = pearsonr(data2['PG-13'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between PG-13 and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='PG-13', y='Gross_worldwide', color='blue')
Pearsons correlation between PG-13 and Gross: 0.173
<AxesSubplot:xlabel='PG-13', ylabel='Gross_worldwide'>
# Write the collected feature columns to CSV, omitting the index column.
final.to_csv('../dataset/extracted/feature_extracted.csv', index=False)