import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy.stats import pearsonr
import random

# Suppress every warning for the rest of the session (this also hides
# deprecation and chained-assignment warnings raised by later cells).
warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
# Load the cleaned dataset; the path is relative to the current working directory.
data = pd.read_csv('../dataset/processed/cleaned_data.csv')
data.head()


import ast

# Each of these columns is converted from its CSV text form into a Python
# object with ast.literal_eval; later cells iterate over the values and take
# their len(), so they are presumably stringified lists.
cols = ['Cast', 'Genre', 'Studios', 'ListOfCertificate', 'Keywords', 'Languages', 'Countries', 'Crew']
for col in cols:
    data[col] = data[col].apply(ast.literal_eval)


def parseWithMoneyAndCount(dataframe, col_name):
    """Aggregate worldwide gross per distinct item of a list-valued column.

    Every row of ``dataframe[col_name]`` is an iterable of items (genres,
    cast members, ...). Each item is credited with the row's
    ``Gross_worldwide`` value, then the credits are grouped by item.

    Args:
        dataframe: Frame holding ``col_name`` and ``Gross_worldwide``.
        col_name: Name of the list-valued column to expand.

    Returns:
        A DataFrame sorted by item with the columns ``col_name``, ``Total``
        (sum of gross), ``Count`` (number of rows containing the item),
        ``Mean`` and ``Median``. It is empty, with those same columns, when
        no row contains any item.
    """
    items = []
    gross = []

    # Pair the two columns positionally; looking the gross up by label with
    # an enumerate counter only works when the index is exactly 0..n-1.
    for record, money in zip(dataframe[col_name], dataframe['Gross_worldwide']):
        for x in record:
            items.append(x)
            gross.append(money)

    columns = [col_name, 'Total', 'Count', 'Mean', 'Median']
    if not items:
        return pd.DataFrame(columns=columns)

    exploded = pd.DataFrame({col_name: items, 'Gross_worldwide': gross})
    grouped = exploded.groupby(col_name)['Gross_worldwide']

    # One grouping pass replaces the three separate groupbys and two merges.
    result = pd.DataFrame({
        'Total': grouped.sum(),
        'Count': grouped.size(),
        'Mean': grouped.mean(),
        'Median': grouped.median(),
    })
    return result.rename_axis(col_name).reset_index()


# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8752 entries, 0 to 8751
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Movie_Title        8752 non-null   object 
 1   Movie_ID           8752 non-null   int64  
 2   Budget             8752 non-null   int64  
 3   Cast               8752 non-null   object 
 4   Crew               8752 non-null   object 
 5   Studios            8752 non-null   object 
 6   Genre              8752 non-null   object 
 7   Keywords           8752 non-null   object 
 8   Languages          8752 non-null   object 
 9   Countries          8752 non-null   object 
 10  Release_Data       8752 non-null   object 
 11  Runtime            8752 non-null   float64
 12  Gross_worldwide    8752 non-null   int64  
 13  Rating             8752 non-null   float64
 14  Rating_Count       8752 non-null   int64  
 15  ListOfCertificate  8752 non-null   object 
 16  Release_Year       8752 non-null   int64  
 17  Release_Month      8752 non-null   int64  
 18  Release_Day        8752 non-null   int64  
dtypes: float64(2), int64(7), object(10)
memory usage: 1.3+ MB


# Summary statistics of the numeric columns.
data.describe()


# Numeric columns whose pairwise correlations are inspected below.
cols = ['Budget', 'Runtime', 'Release_Year', 'Gross_worldwide', 'Rating', 'Rating_Count', 'Release_Month']
info = data[cols]


# Annotated heatmap of the correlation matrix of the selected columns.
sns.heatmap(info.corr(), annot=True)

<AxesSubplot:>


# One 50-bin histogram per numeric column of the full frame.
data.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'Movie_ID'}>,
        <AxesSubplot:title={'center':'Budget'}>,
        <AxesSubplot:title={'center':'Runtime'}>],
       [<AxesSubplot:title={'center':'Gross_worldwide'}>,
        <AxesSubplot:title={'center':'Rating'}>,
        <AxesSubplot:title={'center':'Rating_Count'}>],
       [<AxesSubplot:title={'center':'Release_Year'}>,
        <AxesSubplot:title={'center':'Release_Month'}>,
        <AxesSubplot:title={'center':'Release_Day'}>]], dtype=object)


from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots for the selected numeric columns, with
# 50-bin histograms on the diagonal.
scatter_matrix(info, figsize=(20, 12), hist_kwds={'bins': 50})

array([[<AxesSubplot:xlabel='Budget', ylabel='Budget'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Budget'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Budget'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Budget'>,
        <AxesSubplot:xlabel='Rating', ylabel='Budget'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Budget'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Budget'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Rating', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Runtime'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Runtime'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Rating', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Release_Year'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Release_Year'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Rating', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Gross_worldwide'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Gross_worldwide'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Rating'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Rating'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Rating'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Rating'>,
        <AxesSubplot:xlabel='Rating', ylabel='Rating'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Rating'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Rating'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Rating', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Rating_Count'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Rating_Count'>],
       [<AxesSubplot:xlabel='Budget', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Runtime', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Release_Year', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Gross_worldwide', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Rating', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Rating_Count', ylabel='Release_Month'>,
        <AxesSubplot:xlabel='Release_Month', ylabel='Release_Month'>]],
      dtype=object)


# Working copy of the data; later cells add engineered feature columns to it.
final = data.copy()


# Per-genre total / count / mean / median of worldwide gross.
genre = parseWithMoneyAndCount(data, 'Genre')
genre


# Each plot is preceded by an in-place sort on the plotted column, so the
# bars appear in descending order; `genre` ends up sorted by Median.
genre.sort_values(by='Count', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Count')
genre.sort_values(by='Mean', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Mean', color="salmon")
genre.sort_values(by='Median', ascending=False, inplace=True)
genre.plot.bar(x='Genre', y='Median', color="salmon")

<AxesSubplot:xlabel='Genre'>


# Display only: genres ordered by how many movies carry them.
genre.sort_values(by='Count', ascending=False)


# Keep genres that appear in more than 10 movies and order them by median
# gross, lowest first. The chain builds a new frame instead of sorting and
# re-indexing the filtered slice in place, which avoids pandas'
# chained-assignment (SettingWithCopy) pattern.
genre = (
    genre[genre['Count'] > 10]
    .sort_values(by='Median', ascending=True)
    .reset_index(drop=True)
)
# Rank 1 is the genre with the lowest median gross; the highest rank is the
# genre with the highest median gross.
genreRank = {name: rank for rank, name in enumerate(genre['Genre'], start=1)}
genreRank

{'Film-Noir': 1,
 'Documentary': 2,
 'Music': 3,
 'History': 4,
 'Biography': 5,
 'Drama': 6,
 'Romance': 7,
 'Western': 8,
 'War': 9,
 'Sport': 10,
 'Crime': 11,
 'Comedy': 12,
 'Musical': 13,
 'Mystery': 14,
 'Horror': 15,
 'Thriller': 16,
 'Fantasy': 17,
 'Action': 18,
 'Family': 19,
 'Sci-Fi': 20,
 'Adventure': 21,
 'Animation': 22}


def getRank(listGenre, rankMap=None):
    """Return the highest rank among the genres of one movie.

    Args:
        listGenre: Iterable of genre names.
        rankMap: Optional mapping from genre name to rank. When omitted, the
            module-level ``genreRank`` dictionary is used.

    Returns:
        The largest rank of the genres found in the mapping, or 0 when none
        of the genres is ranked (including an empty ``listGenre``).
    """
    ranks = genreRank if rankMap is None else rankMap
    # Generator + builtin max replaces the manual running maximum, which
    # shadowed the builtin name `max`.
    return max((ranks[g] for g in listGenre if g in ranks), default=0)


# Score every movie with getRank and measure how linearly that score follows
# worldwide gross.
data2 = data.assign(GenreRank=data['Genre'].apply(getRank))
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsonr correlation between GenreRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='GenreRank', y='Gross_worldwide', color='blue')

Pearsonr correlation between GenreRank and Gross: 0.253

<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>


def getRank(listGenre, rankMap=None):
    """Return the sum of the ranks of the genres of one movie.

    Args:
        listGenre: Iterable of genre names.
        rankMap: Optional mapping from genre name to rank. When omitted, the
            module-level ``genreRank`` dictionary is used.

    Returns:
        The sum of the ranks of the genres found in the mapping; 0 when none
        of the genres is ranked.
    """
    ranks = genreRank if rankMap is None else rankMap
    # The previous accumulator started at the sentinel -1 and was never
    # reset, so every non-empty sum came out one too small and a movie whose
    # only ranked genre had rank 1 scored 0, the same as an unranked movie.
    return sum(ranks[g] for g in listGenre if g in ranks)


# Recompute the genre score with the current getRank and check its linear
# correlation with worldwide gross.
data2 = data.assign(GenreRank=data['Genre'].apply(getRank))
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between GenreRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='GenreRank', y='Gross_worldwide', color='royalblue')

Pearsons correlation between GenreRank and Gross: 0.316

<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>


def getRank(listGenre, rankMap=None):
    """Return the mean rank of the ranked genres of one movie.

    Args:
        listGenre: Iterable of genre names.
        rankMap: Optional mapping from genre name to rank. When omitted, the
            module-level ``genreRank`` dictionary is used.

    Returns:
        The arithmetic mean of the ranks of the genres found in the mapping.
        When no genre is ranked, a random integer in [1, 15] is returned
        instead, so the result is not reproducible unless ``random`` is
        seeded by the caller.
    """
    ranks = genreRank if rankMap is None else rankMap
    known = [ranks[g] for g in listGenre if g in ranks]
    if not known:
        # NOTE(review): random imputation kept from the original; consider a
        # fixed fallback (e.g. the median rank) for reproducible features.
        return random.randint(1, 15)
    # Average over the ranked genres only. Dividing by len(listGenre) counted
    # unranked genres as rank 0 and pulled the mean down.
    return sum(known) / len(known)


# Score every movie with the current getRank and report the linear
# correlation of that score with worldwide gross.
data2 = data.assign(GenreRank=data['Genre'].apply(getRank))
corr = pearsonr(data2['GenreRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between GenreRank and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='GenreRank', y='Gross_worldwide', color='royalblue')

Pearsons correlation between GenreRank and Gross: 0.325

<AxesSubplot:xlabel='GenreRank', ylabel='Gross_worldwide'>


# Keep the genre score computed above as a feature.
final['GenreRank'] = data2['GenreRank']


# Binary flag: 1 when 'Adventure' is among the movie's genres, else 0.
data2 = data.assign(
    IsAdventure=data['Genre'].apply(lambda genres: int('Adventure' in genres))
)
corr = pearsonr(data2['IsAdventure'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between IsAdventure and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='IsAdventure', y='Gross_worldwide', color='royalblue')

Pearsons correlation between IsAdventure and Gross: 0.366

<AxesSubplot:xlabel='IsAdventure', ylabel='Gross_worldwide'>


# Keep the Adventure flag as a feature.
final['IsAdventure'] = data2['IsAdventure']


data2 = data.copy()
# Parse the release date strings (expected as YYYY-MM-DD) into timestamps.
data2['Release_Data'] = pd.to_datetime(data2['Release_Data'], format='%Y-%m-%d')
# pandas' dt.weekday is 0 for Monday through 6 for Sunday, so WeekDay runs
# from 2 (Monday) to 8 (Sunday).
data2 = data2.assign(WeekDay=data2['Release_Data'].dt.weekday + 2)
data2['WeekDay'].value_counts()

6    6790
4    1004
5     624
7     131
3      89
2      60
8      54
Name: WeekDay, dtype: int64


# Bar chart of how many movies were released on each day of the month.
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas versions (pandas 2.0
# names the counts column 'count' instead of reusing the Series name).
data['Release_Day'].value_counts().rename_axis('index').reset_index(name='Release_Day').plot.bar(x='index', y='Release_Day')

<AxesSubplot:xlabel='index'>


# Bar chart of release counts per WeekDay code, ordered by the code.
# Column names are set explicitly so the plot does not depend on how the
# installed pandas version names value_counts().reset_index() columns.
data2['WeekDay'].value_counts().rename_axis('index').reset_index(name='WeekDay').sort_values(by='index').plot.bar(x='index', y='WeekDay')

<AxesSubplot:xlabel='index'>


# Worldwide gross against the WeekDay code of the release date.
data2.plot.scatter(x='WeekDay', y='Gross_worldwide', color='brown')

<AxesSubplot:xlabel='WeekDay', ylabel='Gross_worldwide'>


# Pearson correlation between the WeekDay code and worldwide gross, followed
# by the same scatter plot as the previous cell in a different colour.
corr = pearsonr(data2['WeekDay'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between WeekDay and Gross: %.3f' % corr)
data2.plot.scatter(x='WeekDay', y='Gross_worldwide', color='blue')

Pearsons correlation between WeekDay and Gross: 0.003

<AxesSubplot:xlabel='WeekDay', ylabel='Gross_worldwide'>


# Flag releases whose WeekDay code is 4 or 6. WeekDay was built as
# dt.weekday + 2, so these codes are Wednesday and Friday.
data2['IsFriWed'] = data2['WeekDay'].apply(lambda x: 1 if x in [4, 6] else 0)
corr = pearsonr(data2['IsFriWed'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between IsFriWed and Gross: %.3f' % corr)
data2.plot.scatter(x='IsFriWed', y='Gross_worldwide', color='blue')

Pearsons correlation between IsFriWed and Gross: 0.047

<AxesSubplot:xlabel='IsFriWed', ylabel='Gross_worldwide'>


# Number of releases per calendar month, ordered by month. The column names
# are assigned directly rather than renamed from the defaults of
# value_counts().reset_index(), which differ between pandas versions.
month = (
    data['Release_Month']
    .value_counts()
    .rename_axis('Month')
    .reset_index(name='ReleaseCount')
    .sort_values(by='Month', ascending=True)
)
month.plot.bar(x='Month', y='ReleaseCount', color='green')

<AxesSubplot:xlabel='Month'>


cols = ['Release_Month', 'Gross_worldwide']

# Raw scatter of worldwide gross against release month.
data.plot(kind='scatter', x='Release_Month', y='Gross_worldwide', color='green')
plt.title("Scatter plot for Release_Month and Gross")

# Palette for the second figure: months 6, 7 and 12 in green, all other
# months in red.
gColor = [6, 7, 12]
rColor = [1, 2, 3, 4, 5, 8, 9, 10, 11]
colorMap = {
    **dict.fromkeys(gColor, 'tab:green'),
    **dict.fromkeys(rColor, 'tab:red'),
}
C = colorMap

# Gross against release year, coloured by release month.
fig = plt.figure(figsize=(8, 6))
sns.scatterplot(x='Release_Year', y='Gross_worldwide', hue='Release_Month', palette=C, data=data)
plt.title("How gross distributed by Month and Year")
plt.show()


# Pearson correlation between release year and worldwide gross, read from
# data2 (the working copy left behind by an earlier cell).
corr = pearsonr(data2['Release_Year'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between Release_Year and Gross: %.3f' % corr)

Pearsons correlation between Release_Year and Gross: 0.209


cols = ['Release_Month', 'Gross_worldwide']

# Median worldwide gross for each release month, drawn as a bar chart.
month = data[cols].groupby("Release_Month").median().reset_index()
month.plot(kind='bar', x='Release_Month', y='Gross_worldwide')
plt.title("Median by Month")

Text(0.5, 1.0, 'Median by Month')


def getSpecialMonth(month):
    """Return 1 when ``month`` is June, July or December (6, 7, 12), else 0."""
    return 1 if month in (6, 7, 12) else 0


# Flag releases in the special months and check how the flag correlates with
# worldwide gross.
data2 = data.assign(SpecialMonth=data['Release_Month'].apply(getSpecialMonth))
corr = pearsonr(data2['SpecialMonth'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between SpecialMonth and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='SpecialMonth', y='Gross_worldwide', color='blue')

Pearsons correlation between SpecialMonth and Gross: 0.120

<AxesSubplot:xlabel='SpecialMonth', ylabel='Gross_worldwide'>


# `cols` is reused from an earlier cell, where it was set to
# ['Release_Month', 'Gross_worldwide'].
month = data[cols]
# Mean worldwide gross for each release month.
month = month.groupby("Release_Month").mean().reset_index()
month.plot.bar(x='Release_Month', y='Gross_worldwide')
plt.title("Average by Month")

Text(0.5, 1.0, 'Average by Month')


def getSpecialMonth(month):
    """Return 1 when ``month`` is one of 5, 6, 7, 11 or 12, otherwise 0."""
    return int(month in (5, 6, 7, 11, 12))


# Same check as before, using the current definition of getSpecialMonth.
data2 = data.assign(SpecialMonth=data['Release_Month'].apply(getSpecialMonth))
corr = pearsonr(data2['SpecialMonth'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between SpecialMonth and Gross: %.3f' % corr)
data2.plot(kind='scatter', x='SpecialMonth', y='Gross_worldwide', color='blue')

Pearsons correlation between SpecialMonth and Gross: 0.142

<AxesSubplot:xlabel='SpecialMonth', ylabel='Gross_worldwide'>


# Keep the most recently computed SpecialMonth flag as a feature.
final['SpecialMonth'] = data2['SpecialMonth']


# Summary statistics of the target column.
data['Gross_worldwide'].describe()

count    8.752000e+03
mean     7.172680e+07
std      1.564077e+08
min      9.500000e+01
25%      4.443069e+06
50%      1.821152e+07
75%      6.560984e+07
max      2.847246e+09
Name: Gross_worldwide, dtype: float64


# Summary statistics of the budget column.
data['Budget'].describe()

count    8.752000e+03
mean     2.351619e+07
std      3.713275e+07
min      2.200000e+02
25%      2.200000e+02
50%      1.000000e+07
75%      3.000000e+07
max      3.560000e+08
Name: Budget, dtype: float64


# Scatter of gross against budget with a fitted regression line.
sns.lmplot(data=data, x='Budget', y='Gross_worldwide')

<seaborn.axisgrid.FacetGrid at 0x2843b774760>


# Pearson correlation between budget and worldwide gross.
corr = pearsonr(data['Budget'], data['Gross_worldwide'])[0]
print('Pearsons correlation between Budget and Gross: %.3f' % corr)

Pearsons correlation between Budget and Gross: 0.741


# Number of listed cast members per movie and its distribution.
data2 = data.assign(numCast=data['Cast'].apply(len))
corr = pearsonr(data2['numCast'], data2['Gross_worldwide'])[0]
data2['numCast'].value_counts()

18    8203
16      66
17      64
15      62
14      53
13      48
12      45
10      35
11      35
9       26
8       23
7       17
6       17
5       12
1       11
4       11
3       11
2        8
0        5
Name: numCast, dtype: int64


# Pearson correlation between the cast-list length and worldwide gross.
corr = pearsonr(data2['numCast'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between numCast and Gross: %.3f' % corr)
data2.plot.scatter(x='numCast', y='Gross_worldwide', color='brown')

Pearsons correlation between numCast and Gross: 0.059

<AxesSubplot:xlabel='numCast', ylabel='Gross_worldwide'>


# Per-cast-member total / count / mean / median of worldwide gross, with the
# most frequently credited members first.
cast = parseWithMoneyAndCount(data, 'Cast')
cast.sort_values(by='Count', ascending=False, inplace=True)
cast


fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,1)
# The 15 cast members with the most movie credits. Note that this rebinds
# data2 to a slice of `cast`, replacing the per-movie working frame.
data2 = cast.sort_values(by='Count', ascending=False)[0:15]
plt.bar(data=data2, x='Cast', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Number movies", fontsize=20)
plt.title("Cast and Number of Movies they cast for", fontsize=20)

Text(0.5, 1.0, 'Cast and Number of Movies they cast for')


# Cast members credited in fewer than three movies, highest median gross
# first. sort_values returns a new frame here; sorting the filtered slice in
# place is pandas' chained-assignment (SettingWithCopy) pattern.
cast3Movies = cast[cast['Count'] < 3].sort_values(by='Median', ascending=False)
cast3Movies


# Figure 1: the 20 cast members with the highest total gross.
fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,1)
data2 = cast.sort_values(by='Total', ascending=False)[0:20]
plt.bar(data=data2, x='Cast', height='Total', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Total Gross", fontsize=15)
plt.title("Cast and Total Gross of Movies they cast for", fontsize=15)
# Figure 2: the 20 cast members with the highest mean gross.
fig = plt.figure(figsize=(8, 6))
# plt.subplot(2,1,2)
data3 = cast.sort_values(by='Mean', ascending=False)[0:20]
plt.bar(data=data3, x='Cast', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Average Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Average Gross", fontsize=15)

# Figure 3: the 20 cast members with the highest median gross.
fig = plt.figure(figsize=(8, 6))
data3 = cast.sort_values(by='Median', ascending=False)[0:20]
plt.bar(data=data3, x='Cast', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Median Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Median Gross", fontsize=15)
plt.show()


cast = parseWithMoneyAndCount(data, 'Cast')
cast.sort_values(by='Count', ascending=False, inplace=True)
# Cast members with more than 5 credits (the variable name says 10, but the
# threshold applied is 5), ordered by mean gross, highest first. The chain
# builds a new frame rather than sorting and re-indexing the filtered slice
# in place, which avoids pandas' chained-assignment (SettingWithCopy) pattern.
cast10Movies = (
    cast[cast['Count'] > 5]
    .sort_values(by='Mean', ascending=False)
    .reset_index(drop=True)
)
cast10Movies


# The 25 frequently credited cast members with the highest mean gross.
plt.figure(figsize=(8, 6))
data3 = cast10Movies.sort_values(by='Mean', ascending=False)[0:25]
plt.bar(data=data3, x='Cast', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Cast and Average Gross of Movies they cast for", fontsize=15)
plt.xlabel("Cast", fontsize=15)
plt.ylabel("Average Gross", fontsize=15)
plt.show()


# Order the frequently credited cast members by mean gross, lowest first.
# The sorted frame is rebound instead of sorted in place, because
# cast10Movies originates from a filtered slice of `cast`.
cast10Movies = cast10Movies.sort_values(by='Mean', ascending=True)
# Rank 1 is the member with the lowest mean gross; higher ranks mean higher
# mean gross.
castRank = {name: rank for rank, name in enumerate(cast10Movies['Cast'], start=1)}
castRank

{'Arsinée Khanjian': 1,
 'Dick Cavett': 2,
 'Tony Curtis': 3,
 'Rita Taggart': 4,
 'Deborah Kerr': 5,
 'John Lennon': 6,
 'Victoria Abril': 7,
 'Jill Schoelen': 8,
 'Alison Steadman': 9,
 'Jacques Mathou': 10,
 'Lior Ashkenazi': 11,
 'Ramon Bieri': 12,
 'Mink Stole': 13,
 'Traci Lind': 14,
 'Emilio Fernández': 15,
 'Jim Metzler': 16,
 'Kathryn Grody': 17,
 'Cheryl Ladd': 18,
 'Patrick Huard': 19,
 'John Cullum': 20,
 'Ricki Lake': 21,
 'Jack Rader': 22,
 'Blu Mankuma': 23,
 'Peter Eyre': 24,
 'Gore Vidal': 25,
 'Don McKellar': 26,
 'Suniel Shetty': 27,
 'Matt Keeslar': 28,
 'Anne-Marie Johnson': 29,
 'Humphrey Bogart': 30,
 'Luke Askew': 31,
 'Marcello Mastroianni': 32,
 'Richard Widmark': 33,
 'John Savident': 34,
 'Vrajesh Hirjee': 35,
 'Millie Perkins': 36,
 'Tim Barlow': 37,
 'Anna Massey': 38,
 'Jodie Markell': 39,
 'Amy Locane': 40,
 'Louise Latham': 41,
 'Gordon Pinsent': 42,
 'Jennifer Edwards': 43,
 'Ingrid Bergman': 44,
 'Debbie Harry': 45,
 'Woody Strode': 46,
 'Sandy Baron': 47,
 'Louise Lasser': 48,
 'Tom Burke': 49,
 'John Considine': 50,
 'Niall Buggy': 51,
 'Al White': 52,
 'Kerry Fox': 53,
 'Albert Delpy': 54,
 'Orson Welles': 55,
 'George Dickerson': 56,
 'Ken Wahl': 57,
 'Nastassja Kinski': 58,
 'John Sayles': 59,
 'Sherilyn Fenn': 60,
 'Perry Lang': 61,
 'Raghuvir Yadav': 62,
 'Eric Payne': 63,
 'Rémy Girard': 64,
 'Dhritiman Chatterjee': 65,
 'Danton Stone': 66,
 'Alan Bates': 67,
 'John F. Kennedy': 68,
 'Laura Morante': 69,
 'Gabriele Ferzetti': 70,
 'Isabelle Adjani': 71,
 'Robert Townsend': 72,
 'Teddy Wilson': 73,
 'Alberta Watson': 74,
 'Ratna Pathak Shah': 75,
 'Steve James': 76,
 'Tristram Jellinek': 77,
 'Perry Lopez': 78,
 'Ian Wolfe': 79,
 'Kirk Douglas': 80,
 'William Russ': 81,
 'Brooke Adams': 82,
 'Michael Hordern': 83,
 'Kieu Chinh': 84,
 'Tony Frank': 85,
 'John Lynch': 86,
 'Antonia Rey': 87,
 'Camille Saviola': 88,
 'Sonakshi Sinha': 89,
 'Harvey Atkin': 90,
 'Gilda Radner': 91,
 'Wings Hauser': 92,
 'Murli Sharma': 93,
 'Prem Chopra': 94,
 'Mukesh Tiwari': 95,
 'Jamie Tirelli': 96,
 'Barack Obama': 97,
 'Anne Pitoniak': 98,
 'Ticky Holgado': 99,
 'Divya Dutta': 100,
 'Lesley-Anne Down': 101,
 'Vipin Sharma': 102,
 'Molly Parker': 103,
 'Lysette Anthony': 104,
 'Puneet Issar': 105,
 'Brijendra Kala': 106,
 'Charles Bronson': 107,
 'John P. Ryan': 108,
 'Charles Cioffi': 109,
 'David Dukes': 110,
 'Leonard L. Thomas': 111,
 'Leonard Termo': 112,
 'Brady Corbet': 113,
 'Wanda De Jesus': 114,
 'David Dwyer': 115,
 'Tom Hickey': 116,
 'Lonny Chapman': 117,
 'Kulbhushan Kharbanda': 118,
 'Miou-Miou': 119,
 'Neil Young': 120,
 'Michael Gross': 121,
 'Gene Kelly': 122,
 'Don Francks': 123,
 'Kirby Heyborne': 124,
 'Steve Railsback': 125,
 'Philip Akin': 126,
 'Richard Brooks': 127,
 'Manoj Pahwa': 128,
 'Ángela Molina': 129,
 'Melonie Diaz': 130,
 'Arye Gross': 131,
 'William McNamara': 132,
 'Rani Mukerji': 133,
 'Vincent Spano': 134,
 'John Ventimiglia': 135,
 'Tinnu Anand': 136,
 'Manoj Bajpayee': 137,
 'James Luisi': 138,
 'Vidya Balan': 139,
 'Lesley Ann Warren': 140,
 'Peter Cook': 141,
 'Farnesio de Bernal': 142,
 'Ray Sharkey': 143,
 'Ray McAnally': 144,
 'Lawrence Tierney': 145,
 'Troy Byer': 146,
 'Ron Leibman': 147,
 'Nigel Terry': 148,
 'Richard Jaeckel': 149,
 'Bill Paterson': 150,
 'Daniel Baldwin': 151,
 'Jeanine Jackson': 152,
 'Mariel Hemingway': 153,
 'Robert Trebor': 154,
 'Victoria Jackson': 155,
 'Billy Green Bush': 156,
 'Meg Foster': 157,
 'Billy Jayne': 158,
 'Saif Ali Khan': 159,
 'Christopher Fulford': 160,
 'Richard Bradford': 161,
 'Rachel Ward': 162,
 'Eleanor Bron': 163,
 'Preity Zinta': 164,
 'Sam Waterston': 165,
 'Henry Silva': 166,
 'Wayne Robson': 167,
 'Matthew Faison': 168,
 'Jean-Marc Barr': 169,
 'David Hemblen': 170,
 'Terry Kiser': 171,
 'Timothy Jerome': 172,
 'David Johansen': 173,
 'Annie Golden': 174,
 'Daniel Auteuil': 175,
 'Sandra Bernhard': 176,
 'Johny Lever': 177,
 'Edie Falco': 178,
 'Molly Hagan': 179,
 'Tony Roberts': 180,
 'Kunal Kapoor': 181,
 'Liz Torres': 182,
 'Bill Clinton': 183,
 'Guy Boyd': 184,
 'Susan Tyrrell': 185,
 "John O'Leary": 186,
 'Rossy de Palma': 187,
 'Jack Nance': 188,
 'Ron White': 189,
 'Massimo Sarchielli': 190,
 'Michael Dudikoff': 191,
 'Pat Corley': 192,
 'Ernest Borgnine': 193,
 'Chris Haywood': 194,
 'Brent Hinkley': 195,
 'Masood Akhtar': 196,
 'Hrithik Roshan': 197,
 'Cybill Shepherd': 198,
 'Kirron Kher': 199,
 'Ione Skye': 200,
 "Raymond O'Connor": 201,
 'Alison Elliott': 202,
 'E. Katherine Kerr': 203,
 'Ashley Peldon': 204,
 'Michael DeLorenzo': 205,
 'Philippe Morier-Genoud': 206,
 'Dennis Letts': 207,
 'Herbert Lom': 208,
 'Marilu Henner': 209,
 'Vincent Kartheiser': 210,
 'Robert Stephens': 211,
 'Mackenzie Astin': 212,
 'Rutanya Alda': 213,
 'Tom Nolan': 214,
 'Lara Dutta': 215,
 'Kim Hunter': 216,
 'Darcy DeMoss': 217,
 'Charles Lane': 218,
 'Panchito Gómez': 219,
 'Polly Draper': 220,
 'Rudy De Luca': 221,
 'Beau Starr': 222,
 'Darren McGavin': 223,
 'Daniel Lapaine': 224,
 'Stuart Margolin': 225,
 'Santos Morales': 226,
 'Shabana Azmi': 227,
 'Chunky Panday': 228,
 'Rajpal Naurang Yadav': 229,
 "De'aundre Bonds": 230,
 'Susan Barnes': 231,
 'Vic Polizos': 232,
 'Tim Thomerson': 233,
 'John Roselius': 234,
 'Lillete Dubey': 235,
 'Gary Grubbs': 236,
 'Kane Hodder': 237,
 'Carrie Snodgress': 238,
 'Stephen Baldwin': 239,
 'Joe Regalbuto': 240,
 'David Harris': 241,
 'Victor Argo': 242,
 'H.B. Haggerty': 243,
 'Jordan Baker': 244,
 'Ajay Devgn': 245,
 'Dan Shor': 246,
 'Harley Cross': 247,
 'Tupac Shakur': 248,
 'Sid Haig': 249,
 'John Dennis Johnston': 250,
 'Joie Lee': 251,
 'Carlin Glynn': 252,
 'William Morgan Sheppard': 253,
 'Ellen David': 254,
 "Olivia d'Abo": 255,
 'Tuesday Weld': 256,
 'Timothy Bottoms': 257,
 'Meshach Taylor': 258,
 'Eddie Albert': 259,
 'Kristy McNichol': 260,
 'Alex Colon': 261,
 'Kareena Kapoor': 262,
 'Ruth Sheen': 263,
 'Isabelle Huppert': 264,
 'Michael Schoeffling': 265,
 'Tim Daly': 266,
 'Sachin Khedekar': 267,
 'Tim Woodward': 268,
 'Mark Acheson': 269,
 'Jonathan Silverman': 270,
 'Neil Crone': 271,
 'R.G. Armstrong': 272,
 'Al Waxman': 273,
 'Jack Wallace': 274,
 'Barbara Sukowa': 275,
 'Tico Wells': 276,
 'Kirk Cameron': 277,
 'Thierry Lhermitte': 278,
 'Joanne Baron': 279,
 'Jason Priestley': 280,
 'John Abraham': 281,
 'Beah Richards': 282,
 'Jean-Claude Dreyfus': 283,
 'John Schneider': 284,
 'Tonya Pinkins': 285,
 'Meg Tilly': 286,
 'John Tormey': 287,
 'Catherine Deneuve': 288,
 'Katrin Cartlidge': 289,
 'Daniel Giménez Cacho': 290,
 'Matthew Laurance': 291,
 'Louis Giambalvo': 292,
 'Miles Chapin': 293,
 'Craig Sheffer': 294,
 "Tom O'Brien": 295,
 'Stoney Jackson': 296,
 'Paul Shenar': 297,
 'Lee Grant': 298,
 'Sheeba Chaddha': 299,
 'Louis Mustillo': 300,
 'Julian Sands': 301,
 'Anne-Marie Duff': 302,
 'Erin Darke': 303,
 'Akshay Kumar': 304,
 'Jim Moody': 305,
 'Anthony Holland': 306,
 'Faye Grant': 307,
 'Clayton Landey': 308,
 'Peter Ustinov': 309,
 'Nicholas Campbell': 310,
 'Paul Bartel': 311,
 'Alan Fudge': 312,
 'Pankaj Tripathi': 313,
 'Mithun Chakraborty': 314,
 'Natalija Nogulich': 315,
 'Richard Benjamin': 316,
 'Bruce Payne': 317,
 'Joe Seneca': 318,
 'Theodore Bikel': 319,
 'Richard Romanus': 320,
 'Gerrit Graham': 321,
 'Ken Campbell': 322,
 'Nadim Sawalha': 323,
 'William Newman': 324,
 'Victoria Tennant': 325,
 'Suresh Menon': 326,
 'Iggy Pop': 327,
 'John LaMotta': 328,
 'Clu Gulager': 329,
 'Kenneth McMillan': 330,
 'Bernie McInerney': 331,
 'Ted McGinley': 332,
 'Ben Johnson': 333,
 'Mira Sorvino': 334,
 'Chloë Sevigny': 335,
 'Julie Delpy': 336,
 'John Barrett': 337,
 'Gerry Black': 338,
 'James Cada': 339,
 'Vlasta Vrana': 340,
 'Harry Northup': 341,
 'Keenan Wynn': 342,
 'Mia Farrow': 343,
 'Ellis Williams': 344,
 'Dennis Dun': 345,
 'Phil Fondacaro': 346,
 'Sylvia Miles': 347,
 'Alia Bhatt': 348,
 'Mary Woronov': 349,
 'Riteish Deshmukh': 350,
 'Lawrence Dane': 351,
 'Peter Arne': 352,
 'Scott Coffey': 353,
 'Michel Blanc': 354,
 'David Bowie': 355,
 'Jason London': 356,
 'Joseph Ragno': 357,
 'Corbin Bernsen': 358,
 'Chris Spencer': 359,
 'Joanne Whalley': 360,
 'Vincent Perez': 361,
 'Fabrice Luchini': 362,
 'Armin Shimerman': 363,
 'Lee de Broux': 364,
 'James Stewart': 365,
 'Angélica Aragón': 366,
 'Shah Rukh Khan': 367,
 'Govardhan Asrani': 368,
 'Bijou Phillips': 369,
 'Moses Gunn': 370,
 'Corey Haim': 371,
 'Annie McEnroe': 372,
 'Cliff De Young': 373,
 'Ken Pogue': 374,
 'André Maranne': 375,
 'Mohd. Zeeshan Ayyub': 376,
 'Michael Cassidy': 377,
 'Satish Shah': 378,
 'Rachael Leigh Cook': 379,
 'Bill Buell': 380,
 'Rishi Kapoor': 381,
 'Jessica Lundy': 382,
 'Jeffrey Combs': 383,
 'Sal Lopez': 384,
 'Lewis Arquette': 385,
 'Marceline Hugot': 386,
 'Peter Vaughan': 387,
 'Paul Butler': 388,
 'Cliff Gorman': 389,
 'Michelle Meyrink': 390,
 'Jackie Shroff': 391,
 'Cooper Huckabee': 392,
 'Bipasha Basu': 393,
 'Lorraine Bracco': 394,
 'Keenen Ivory Wayans': 395,
 'Julie Payne': 396,
 'Donald Pleasence': 397,
 'Fernando Rey': 398,
 'Richard Belzer': 399,
 'Kim Wayans': 400,
 'Shelley Winters': 401,
 'Michael Tucker': 402,
 'David de Keyser': 403,
 'Pawan Malhotra': 404,
 'Jacqueline Bisset': 405,
 'Royal Dano': 406,
 'Luke Edwards': 407,
 'José Ferrer': 408,
 'Dov Tiefenbach': 409,
 'Mary Stuart Masterson': 410,
 'Bill Sage': 411,
 'Molly Ringwald': 412,
 'Justin Edwards': 413,
 'Arshad Warsi': 414,
 'Anthony Johnson': 415,
 'Malaika Arora': 416,
 'Frederic Forrest': 417,
 'Toshirô Mifune': 418,
 'Bert Remsen': 419,
 'Arun Bali': 420,
 'Laura Ramsey': 421,
 'Sarah Polley': 422,
 'Julius Harris': 423,
 'Michael Greene': 424,
 'Charles McKeown': 425,
 'Steven Weber': 426,
 'Steve Antin': 427,
 'Michael McKean': 428,
 'Peter Sellers': 429,
 'Ned Bellamy': 430,
 'Peter Kwong': 431,
 'Abhishek Bachchan': 432,
 'John Vernon': 433,
 'Chuck Cooper': 434,
 'Earl Billings': 435,
 'Fanny Ardant': 436,
 'Gene Davis': 437,
 'Michael Paré': 438,
 'Thomas Hill': 439,
 'Michael J. Reynolds': 440,
 'Andre Gregory': 441,
 'Daphne Zuniga': 442,
 'Henry Fonda': 443,
 'Darlanne Fluegel': 444,
 'Trini Alvarado': 445,
 'Christine Lahti': 446,
 'Sally Kellerman': 447,
 'Jodi Long': 448,
 'Louis Guss': 449,
 'Mickey Jones': 450,
 'Christopher Malcolm': 451,
 'Arjun Rampal': 452,
 'Albert Salmi': 453,
 'Ice-T': 454,
 'Arlen Dean Snyder': 455,
 'Pauline Collins': 456,
 "Milo O'Shea": 457,
 'Tom Wright': 458,
 'Danny Aiello': 459,
 'Juhi Chawla': 460,
 'George Burns': 461,
 'Newell Alexander': 462,
 'Phil Davis': 463,
 'Sean McCann': 464,
 'T.K. Carter': 465,
 'Naomi Campbell': 466,
 'Shawn Hatosy': 467,
 'Piper Laurie': 468,
 'Judd Nelson': 469,
 'Jane Hallaren': 470,
 'Steve Forrest': 471,
 'Esai Morales': 472,
 'Louise Fletcher': 473,
 'Maria Conchita Alonso': 474,
 'Mia Kirshner': 475,
 'Sônia Braga': 476,
 'Matthew Cowles': 477,
 'Joe Unger': 478,
 'Dagmara Dominczyk': 479,
 'Allan Arbus': 480,
 'RuPaul': 481,
 'Lee Ving': 482,
 'Sy Richardson': 483,
 'Michael V. Gazzo': 484,
 'Sammy Davis Jr.': 485,
 'Maggie McCarthy': 486,
 'Jenny Wright': 487,
 'Ann Wedgeworth': 488,
 'John Carradine': 489,
 'Warren Clarke': 490,
 'Carmine Caridi': 491,
 'Jon Polito': 492,
 'James N. Harrell': 493,
 'Kathleen Wilhoite': 494,
 'Manoj Joshi': 495,
 'Bill Moseley': 496,
 'Suzanne Shepherd': 497,
 'George R. Robertson': 498,
 'Geraldine Page': 499,
 'Zakir Hussain': 500,
 'Keith Carradine': 501,
 'Robin Thomas': 502,
 'Mark Webber': 503,
 'Jack Elam': 504,
 'Ari Graynor': 505,
 'Severn Darden': 506,
 'Janet MacLachlan': 507,
 'Tony DiBenedetto': 508,
 'Geneviève Bujold': 509,
 'Ken Magee': 510,
 'Joe Lisi': 511,
 'Loyd Catlett': 512,
 'Robert Carradine': 513,
 'Michael Alldredge': 514,
 'Jimmy Sheirgill': 515,
 'Richard Masur': 516,
 'Ian Bannen': 517,
 'Laurence Olivier': 518,
 'William Prince': 519,
 'Richard B. Shull': 520,
 'Karl Johnson': 521,
 'Rita Moreno': 522,
 'Anthony De Longis': 523,
 'Sarah Trigger': 524,
 'Freddie Jones': 525,
 'Sanjay Mishra': 526,
 'David Proval': 527,
 'James Staley': 528,
 'Tom Villard': 529,
 'Donald Hotton': 530,
 'Jackie Burroughs': 531,
 'Amy Wright': 532,
 'Richard Mulligan': 533,
 'Diana Bellamy': 534,
 'Mark Duplass': 535,
 'Don Pugsley': 536,
 'Fisher Stevens': 537,
 'Ranbir Kapoor': 538,
 'Levon Helm': 539,
 'Adrian Dunbar': 540,
 'James Wilby': 541,
 'Jan Tríska': 542,
 'Rae Dawn Chong': 543,
 'Anne De Salvo': 544,
 'Tony Lo Bianco': 545,
 'Patricia Arquette': 546,
 'Jonathan Brandis': 547,
 'Alia Shawkat': 548,
 'Anna Chancellor': 549,
 'Nicholas Rowe': 550,
 'Manny Perez': 551,
 'Stacy Edwards': 552,
 'Dennis Lipscomb': 553,
 'Vic Tayback': 554,
 'Sam McMurray': 555,
 'Michael Laskin': 556,
 'Priscilla Pointer': 557,
 'Jan Rubes': 558,
 'Lee Wilkof': 559,
 'James Keach': 560,
 'Ken Foree': 561,
 'Spike Lee': 562,
 'John Dunn-Hill': 563,
 'Ben Gazzara': 564,
 'Kent Broadhurst': 565,
 "Dick O'Neill": 566,
 'James Urbaniak': 567,
 'Boman Irani': 568,
 'Michael Harding': 569,
 'Jennifer Jason Leigh': 570,
 'Roger Aaron Brown': 571,
 'Jaaved Jaaferi': 572,
 'Jane Kaczmarek': 573,
 'Sam Wanamaker': 574,
 'Kevin Conway': 575,
 'Raleigh Bond': 576,
 'Maribel Verdú': 577,
 'Blue Deckert': 578,
 'Georgann Johnson': 579,
 'Richard Farnsworth': 580,
 'Lisa Jane Persky': 581,
 'Billie Whitelaw': 582,
 'Lindsay Crouse': 583,
 'Kajol': 584,
 'Sonam Kapoor': 585,
 'Nicole Beharie': 586,
 'Bruce M. Fischer': 587,
 'James Mason': 588,
 'Aishwarya Rai Bachchan': 589,
 'Pam Grier': 590,
 'Richard Lynch': 591,
 'Sonny Carl Davis': 592,
 'Marsha Mason': 593,
 'Peter Horton': 594,
 'Steve McQueen': 595,
 'Kabir Bedi': 596,
 'Sully Boyar': 597,
 'Audra Lindley': 598,
 'Shelley Duvall': 599,
 'Dick Van Patten': 600,
 'Billy Beck': 601,
 'Sanjay Dutt': 602,
 'Dana Delany': 603,
 'Kelly Jo Minter': 604,
 'David Hart': 605,
 'Jeremy Northam': 606,
 'Don Hood': 607,
 'Keith Coogan': 608,
 'Ronald Guttman': 609,
 'Stanley Brock': 610,
 'Hart Bochner': 611,
 'William Smith': 612,
 'Peter Elliott': 613,
 'Mike Moroff': 614,
 'John Ritter': 615,
 'Lolita Davidovich': 616,
 'Ben Shenkman': 617,
 'Bob Minor': 618,
 'Todd Graff': 619,
 'Bo Hopkins': 620,
 'Mamie Gummer': 621,
 'Maria Dizzia': 622,
 'Ranjit Chowdhry': 623,
 'Phoebe Cates': 624,
 'Paul Benjamin': 625,
 'Pauly Shore': 626,
 'Del Close': 627,
 'Miriam Colon': 628,
 'Ally Sheedy': 629,
 'Joanna Pacula': 630,
 'Julie Warner': 631,
 'Allan Corduner': 632,
 'William Ragsdale': 633,
 'Tom Heaton': 634,
 'Norman Fell': 635,
 'Mike Pniewski': 636,
 'Vincent Price': 637,
 'Renée Taylor': 638,
 'Ed Grady': 639,
 'Ian Gomez': 640,
 'Nicollette Sheridan': 641,
 'William Traylor': 642,
 'Dana Wheeler-Nicholson': 643,
 'Brian McNamara': 644,
 'Method Man': 645,
 'John Sessions': 646,
 'Graham Stark': 647,
 'Paul Le Mat': 648,
 'Kenneth Tigar': 649,
 'Woody Allen': 650,
 'Marvin J. McIntyre': 651,
 'Jacqueline Fernandez': 652,
 'Romola Garai': 653,
 "Sean 'Diddy' Combs": 654,
 'Eloy Casados': 655,
 'Helen Hanft': 656,
 'John Saxon': 657,
 'Howard Hesseman': 658,
 'Dirk Blocker': 659,
 'Shelley Long': 660,
 'David Bradley': 661,
 'Brooke Shields': 662,
 'Taylor Negron': 663,
 'Meagen Fay': 664,
 'Judith Ivey': 665,
 'Alexandra Holden': 666,
 'Phyllis Somerville': 667,
 'Nathan Davis': 668,
 'John Houseman': 669,
 'George Plimpton': 670,
 'Michael J. Pagan': 671,
 'Marco Rodríguez': 672,
 'Robert Vaughn': 673,
 'Damon Wayans': 674,
 'Karen Young': 675,
 'Farrah Fawcett': 676,
 'Billy Barty': 677,
 'Gillian Jacobs': 678,
 'Mark Bringelson': 679,
 'Haviland Morris': 680,
 'Hill Harper': 681,
 'William Petersen': 682,
 "Annette O'Toole": 683,
 'Clifford A. Pellow': 684,
 'Irène Jacob': 685,
 'Eric Schweig': 686,
 'Mark Tandy': 687,
 'Emilio Estevez': 688,
 'Wilbur Fitzgerald': 689,
 'Brenda Vaccaro': 690,
 'Nelsan Ellis': 691,
 'Ron Silver': 692,
 'Edith Fields': 693,
 'Malinda Williams': 694,
 'Talia Balsam': 695,
 'George Coe': 696,
 'Gene Hartline': 697,
 'J.W. Smith': 698,
 'Peter MacNeill': 699,
 'Sarita Choudhury': 700,
 'Rupert Graves': 701,
 'Carol Sutton': 702,
 'Kim Greist': 703,
 'Philip Bruns': 704,
 'Joyce Brothers': 705,
 'Nicholas Farrell': 706,
 'Michael J. Pollard': 707,
 "Dan O'Herlihy": 708,
 'Jon Foster': 709,
 'Lee Richardson': 710,
 'Katrina Kaif': 711,
 'Willard E. Pugh': 712,
 'Rupert Frazer': 713,
 'Robert Webber': 714,
 'Annabella Sciorra': 715,
 'Zulay Henao': 716,
 'John Doe': 717,
 'Gailard Sartain': 718,
 'Welker White': 719,
 'Pepe Serna': 720,
 'James Pickens Jr.': 721,
 'Rosanna Arquette': 722,
 'Spalding Gray': 723,
 'Catherine McCormack': 724,
 'Graham Jarvis': 725,
 'Cynthia Stevenson': 726,
 'Lauren Hutton': 727,
 'Salman Khan': 728,
 'Paula Garcés': 729,
 'Eddie Cibrian': 730,
 'Helen Lloyd Breed': 731,
 'Jeffrey Nordling': 732,
 'Denise Crosby': 733,
 'Cindy Williams': 734,
 'Kim Delaney': 735,
 'James Whitmore': 736,
 'Daniel Benzali': 737,
 'Kathryn Erbe': 738,
 'Burke Byrnes': 739,
 'Nancy Marchand': 740,
 'Paul Sorvino': 741,
 'D.W. Moffett': 742,
 'Bill Bellamy': 743,
 'Ray Walston': 744,
 'Amy Madigan': 745,
 'Carl Lumbly': 746,
 'L.Q. Jones': 747,
 'Amy Irving': 748,
 'Terence Kelly': 749,
 'Al Fann': 750,
 'Lane Smith': 751,
 'Bono': 752,
 'David Byrd': 753,
 'Kai Wulff': 754,
 'Sushant Singh Rajput': 755,
 'Om Puri': 756,
 'Mary J. Blige': 757,
 'Beau Bridges': 758,
 'Richard Bright': 759,
 'Kevin Heffernan': 760,
 'Sean Young': 761,
 'Lyman Ward': 762,
 'Klaus Kinski': 763,
 'Dwier Brown': 764,
 'Kevin McCarthy': 765,
 'Mario Van Peebles': 766,
 'David L. Lander': 767,
 'Steven Randazzo': 768,
 'Joseph Maher': 769,
 'Hanns Zischler': 770,
 'Ebbe Roe Smith': 771,
 'Dey Young': 772,
 'Cindy Pickett': 773,
 'Faye Dunaway': 774,
 'Frank Adonis': 775,
 'Annabeth Gish': 776,
 'Joan Chen': 777,
 'Charles Levin': 778,
 'Leslie Caron': 779,
 'Roberto Sosa': 780,
 'Brett Rice': 781,
 'George W. Bush': 782,
 'Louis Gossett Jr.': 783,
 'James Eckhouse': 784,
 'Warren Oates': 785,
 'Carol Kane': 786,
 'Arthur J. Nascarella': 787,
 'Chelsea Field': 788,
 'Tim Ware': 789,
 'James Biberi': 790,
 'Jérémie Renier': 791,
 'Karina Arroyave': 792,
 'Nancy Allen': 793,
 'Tom Savini': 794,
 'Dudley Moore': 795,
 'Joe Santos': 796,
 "Michael O'Keefe": 797,
 'Lauren London': 798,
 'Jack Thibeau': 799,
 'Andrew Robinson': 800,
 'John Standing': 801,
 'Obba Babatundé': 802,
 'Gregory Walcott': 803,
 'Todd Allen': 804,
 'Gregory Hines': 805,
 'Andrew McCarthy': 806,
 'John Gallagher Jr.': 807,
 'Peter Bromilow': 808,
 'Darrell Larson': 809,
 'Judy Davis': 810,
 'George C. Scott': 811,
 'Doris Roberts': 812,
 'Sheri Moon Zombie': 813,
 'James Gammon': 814,
 'Schuyler Fisk': 815,
 'Eddie Velez': 816,
 'Natasha Gregson Wagner': 817,
 'Robert Miano': 818,
 'Isaac Hayes': 819,
 'Anupam Kher': 820,
 'Jason Beghe': 821,
 'Kadeem Hardison': 822,
 'Roger Cross': 823,
 'Naushaad Abbas': 824,
 'Judy Parfitt': 825,
 'Connie Britton': 826,
 'Rockets Redglare': 827,
 'Janet Carroll': 828,
 'Lisa Blount': 829,
 'Max Wright': 830,
 "Patti D'Arbanville": 831,
 'Chuck Shamata': 832,
 'Larry Flash Jenkins': 833,
 'Chuck McCann': 834,
 'Kaki Hunter': 835,
 'Alan North': 836,
 'F. William Parker': 837,
 'Stephen E. Miller': 838,
 'Margaret Whitton': 839,
 'Ranveer Singh': 840,
 'Richard C. Sarafian': 841,
 'Paresh Rawal': 842,
 'Sheila Kelley': 843,
 'John de Lancie': 844,
 'Woodrow Parfrey': 845,
 'Brian Keith': 846,
 'Eric Bogosian': 847,
 'David Hayman': 848,
 'Diane Ladd': 849,
 'Melanie Griffith': 850,
 'Clifton James': 851,
 'Helen Martin': 852,
 'Fenella Woolgar': 853,
 'Jerry Levine': 854,
 'Louis Herthum': 855,
 'Ludivine Sagnier': 856,
 'David Selby': 857,
 'Robert Swan': 858,
 'Mel Winkler': 859,
 'Candy Clark': 860,
 'Gretchen Mol': 861,
 'J. Kenneth Campbell': 862,
 'John Hancock': 863,
 'Neil Ross': 864,
 'Jacqueline Brookes': 865,
 'Danny Webb': 866,
 'Sam Bottoms': 867,
 'Penelope Allen': 868,
 'Ken Lerner': 869,
 'John Philbin': 870,
 'J. Smith-Cameron': 871,
 'John Sharian': 872,
 'Essence Atkins': 873,
 'Alex Kendrick': 874,
 'Christopher Curry': 875,
 'Dennis Burkley': 876,
 'Christopher Abbott': 877,
 'Naseeruddin Shah': 878,
 'Frank Baker': 879,
 'Juliet Stevenson': 880,
 'George Wendt': 881,
 'Brandon Smith': 882,
 'Rony Clanton': 883,
 'David Wohl': 884,
 'Mike White': 885,
 'Greta Scacchi': 886,
 'Daniel Hugh Kelly': 887,
 'Jerry Hardin': 888,
 'Devon Sawa': 889,
 'Ira Wheeler': 890,
 "Ryan O'Neal": 891,
 'Fred Astaire': 892,
 'Paula Jai Parker': 893,
 "Mo'Nique": 894,
 'Willie Nelson': 895,
 'Peter Strauss': 896,
 'Bess Armstrong': 897,
 'Lou Diamond Phillips': 898,
 'Roseanne Barr': 899,
 'Geoffrey Lewis': 900,
 'Marc Lawrence': 901,
 'André Dussollier': 902,
 'Daniel Gerroll': 903,
 'Angel David': 904,
 'Edward Fox': 905,
 'Tia Texada': 906,
 'Bibi Besch': 907,
 'Rick Overton': 908,
 'Michael McGrady': 909,
 'Lynn Redgrave': 910,
 'Dyan Cannon': 911,
 'William Holden': 912,
 'Aidan Quinn': 913,
 'Ellen Barkin': 914,
 'John Terry': 915,
 'Badja Djola': 916,
 'Nicholas Guest': 917,
 'Leslie Easterbrook': 918,
 'Yaphet Kotto': 919,
 'Herta Ware': 920,
 'Maddie Corman': 921,
 'Ajay Naidu': 922,
 'Tamala Jones': 923,
 'Catherine Hicks': 924,
 'Adam LeFevre': 925,
 'Art Hindle': 926,
 'Bridget Fonda': 927,
 'Eileen Ryan': 928,
 'Ed Begley Jr.': 929,
 'Nina Foch': 930,
 'Raphael Sbarge': 931,
 'Charles Cyphers': 932,
 'Martin Savage': 933,
 'George Wallace': 934,
 'François Chau': 935,
 'Franklyn Ajaye': 936,
 'Ernie Lively': 937,
 'Tito Larriva': 938,
 "Terry O'Quinn": 939,
 'Diana Scarwid': 940,
 'Michael Currie': 941,
 'Burt Kwouk': 942,
 'Geoffrey Arend': 943,
 'Lance Kinsey': 944,
 'Melissa Sagemiller': 945,
 'Ellen Greene': 946,
 'Phyllida Law': 947,
 'Lisa Eichhorn': 948,
 'Burt Lancaster': 949,
 'Tom Bower': 950,
 'Amitabh Bachchan': 951,
 'Lilyan Chauvin': 952,
 'John Gielgud': 953,
 'Ed Lauter': 954,
 'Kari Wuhrer': 955,
 'J.C. Quinn': 956,
 'Thomas Jefferson Byrd': 957,
 'Duane Martin': 958,
 'Scott Paulin': 959,
 'Brad Sullivan': 960,
 'Scott Wilson': 961,
 'Patrick Cranshaw': 962,
 'Martha Higareda': 963,
 'James Hampton': 964,
 'Becky Ann Baker': 965,
 'Claire Bloom': 966,
 'Ray Wise': 967,
 'Billy Drago': 968,
 'Ruth McCabe': 969,
 'Jack Warden': 970,
 'Susanna Thompson': 971,
 'Wilfrid Hyde-White': 972,
 'Scott Burkholder': 973,
 'Rufus': 974,
 'Scatman Crothers': 975,
 'Brian Hooks': 976,
 'Chris Sarandon': 977,
 'Barry Primus': 978,
 'Robert Joy': 979,
 'Sharat Saxena': 980,
 'Randall Batinkoff': 981,
 'Efren Ramirez': 982,
 'Paul Reubens': 983,
 'Charles Haid': 984,
 'Graham Beckel': 985,
 'George Kennedy': 986,
 'Duncan Fraser': 987,
 'Matt Clark': 988,
 "Beverly D'Angelo": 989,
 'Bud Cort': 990,
 'Macy Gray': 991,
 'Gwen McGee': 992,
 'Martha Gehman': 993,
 'Roxanne Hart': 994,
 'Erich Anderson': 995,
 'Felicity Huffman': 996,
 'Brendan Sexton III': 997,
 'Natasha Richardson': 998,
 'Wayne Grace': 999,
 'Tom Towles': 1000,
 ...}


def getCastsTeamRank(casts):
    """Return the summed ``castRank`` score of the names in ``casts``.

    A name that has an entry in ``castRank`` contributes that entry's value.
    A name without an entry contributes ``random.randint(1, 200)`` instead,
    so the result for such inputs varies between calls unless the ``random``
    module has been seeded.

    An empty ``casts`` yields 0.
    """
    return sum(
        castRank[name] if name in castRank else random.randint(1, 200)
        for name in casts
    )


# Work on a copy so the derived column is not added to `data` itself.
data2 = data.copy()
# One CastsRank value per row, computed from that row's Cast list.
data2['CastsRank'] = data2['Cast'].apply(getCastsTeamRank)
# Last expression of the cell: shows how often each CastsRank value occurs.
data2['CastsRank'].value_counts()

2055     6
0        5
17485    4
1806     4
1953     4
        ..
27199    1
4658     1
27183    1
33941    1
31443    1
Name: CastsRank, Length: 7779, dtype: int64


# pearsonr returns (coefficient, p-value); only the coefficient is kept.
corr = pearsonr(data2['CastsRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CastsRank and Gross: %.3f' % corr)
# Scatter plot of the same two columns; the returned Axes is the cell output.
data2.plot.scatter(x='CastsRank', y='Gross_worldwide', color='brown')

Pearsons correlation between CastsRank and Gross: 0.526

<AxesSubplot:xlabel='CastsRank', ylabel='Gross_worldwide'>


# Keep the summed CastsRank (the variant with the random fallback) as a column
# of `final`. The averaged variant computed further down is not stored.
# NOTE(review): `final` is created outside this section; the assignment aligns
# on index, so this presumably relies on `final` sharing data's index - confirm.
final['CastsRank'] = data2['CastsRank']


def getCastsTeamRank(casts):
    """Return the average ``castRank`` score over all names in ``casts``.

    Names without an entry in ``castRank`` add nothing to the total but still
    count towards the divisor. An empty ``casts`` yields 0.0.
    """
    known_total = sum(castRank[name] for name in casts if name in castRank)
    # Divide by at least 1 so an empty cast list does not raise.
    return known_total / max(len(casts), 1)


# Recompute CastsRank with the averaging getCastsTeamRank defined just above
# and measure how it correlates with worldwide gross.
data2 = data.copy()
data2['CastsRank'] = data2['Cast'].apply(getCastsTeamRank)
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
corr = pearsonr(data2['CastsRank'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between CastsRank and Gross: %.3f' % corr)
# Scatter plot of the same two columns; the returned Axes is the cell output.
data2.plot.scatter(x='CastsRank', y='Gross_worldwide', color='brown')

Pearsons correlation between CastsRank and Gross: 0.513

<AxesSubplot:xlabel='CastsRank', ylabel='Gross_worldwide'>


# Reorder cast10Movies in place by its 'Mean' column, highest first.
# NOTE(review): cast10Movies is built outside this section; 'Mean' is
# presumably the mean worldwide gross from parseWithMoneyAndCount - confirm.
cast10Movies.sort_values(by='Mean', ascending=False, inplace=True)
# Names from the first 100 rows of that ordering.
top100Cast = list(cast10Movies['Cast'][0:100])


def getNumLeadActors(casts):
    """Count the entries of ``casts`` that appear in ``top100Cast``.

    Every occurrence is counted, so a name repeated in ``casts`` is counted
    once per repetition. An empty ``casts`` yields 0.
    """
    return sum(1 for name in casts if name in top100Cast)


# Work on a copy so the derived column is not added to `data` itself.
data2 = data.copy()
# Per row: how many names of its Cast list are in top100Cast.
data2['NumLeadActors'] = data2['Cast'].apply(getNumLeadActors)
# Last expression of the cell: shows how often each count occurs.
data2['NumLeadActors'].value_counts()

0     7988
1      574
2      103
3       34
4       22
5       15
7        5
9        4
8        3
6        2
11       2
Name: NumLeadActors, dtype: int64


# pearsonr returns (coefficient, p-value); only the coefficient is kept.
corr = pearsonr(data2['NumLeadActors'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumLeadActors and Gross: %.3f' % corr)
# Scatter plot of the same two columns; the returned Axes is the cell output.
data2.plot.scatter(x='NumLeadActors', y='Gross_worldwide', color='brown')

Pearsons correlation between NumLeadActors and Gross: 0.593

<AxesSubplot:xlabel='NumLeadActors', ylabel='Gross_worldwide'>


# Keep the NumLeadActors column computed above as a column of `final`.
# NOTE(review): `final` is created outside this section; the assignment aligns
# on index, so this presumably relies on `final` sharing data's index - confirm.
final['NumLeadActors'] = data2['NumLeadActors']


# Reorder cast10Movies in place by its 'Mean' column, highest first.
cast10Movies.sort_values(by='Mean', ascending=False, inplace=True)
# Names from the first 50 rows of that ordering.
top50Cast = list(cast10Movies['Cast'][0:50])


def getHasTop30Actors(casts):
    """Return 1 if any name in ``casts`` is in ``top50Cast``, otherwise 0.

    NOTE: despite the "Top30" in the name, membership is tested against
    ``top50Cast``; the name is kept because callers refer to it.
    """
    return int(any(name in top50Cast for name in casts))


# Work on a copy so the derived column is not added to `data` itself.
data2 = data.copy()
# 1/0 flag per row: does its Cast list contain a name from top50Cast?
# (getHasTop30Actors checks top50Cast despite its name.)
data2['HasTop50Actors'] = data2['Cast'].apply(getHasTop30Actors)
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
corr = pearsonr(data2['HasTop50Actors'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between HasTop50Actors and Gross: %.3f' % corr)
# Scatter plot of the same two columns; the returned Axes is the cell output.
data2.plot.scatter(x='HasTop50Actors', y='Gross_worldwide', color='brown')

Pearsons correlation between HasTop50Actors and Gross: 0.364

<AxesSubplot:xlabel='HasTop50Actors', ylabel='Gross_worldwide'>


# Keep the HasTop50Actors flag computed above as a column of `final`.
# NOTE(review): `final` is created outside this section; the assignment aligns
# on index, so this presumably relies on `final` sharing data's index - confirm.
final['HasTop50Actors'] = data2['HasTop50Actors']


# Work on a copy so the derived column is not added to `data` itself.
data2 = data.copy()
# Number of entries in each row's Crew list.
data2['NumCrews'] = data2['Crew'].apply(len)
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
corr = pearsonr(data2['NumCrews'], data2['Gross_worldwide'])[0]
print('Pearsons correlation between NumCrews and Gross: %.3f' % corr)
# Scatter plot of the same two columns; the returned Axes is the cell output.
data2.plot.scatter(x='NumCrews', y='Gross_worldwide', color='brown')

Pearsons correlation between NumCrews and Gross: 0.190

<AxesSubplot:xlabel='NumCrews', ylabel='Gross_worldwide'>


# Keep the NumCrews column computed above as a column of `final`.
# NOTE(review): `final` is created outside this section; the assignment aligns
# on index, so this presumably relies on `final` sharing data's index - confirm.
final['NumCrews'] = data2['NumCrews']


# One row per distinct Crew name with Total / Count / Mean / Median of the
# Gross_worldwide values of the rows that list that name.
crew = parseWithMoneyAndCount(data, 'Crew')
# Displayed only (not stored): crew members ordered by number of appearances.
crew.sort_values(by='Count', ascending=False)


# Bar chart of the 20 crew members with the highest Count.
crew.sort_values(by='Count', ascending=False)[0:20].plot.bar(x='Crew', y='Count', color='green')

<AxesSubplot:xlabel='Crew'>


# Displayed only: the 20 crew members with the highest Mean, with no minimum
# Count applied.
crew.sort_values(by='Mean', ascending=False)[0:20]


# Keep only crew members whose Count is greater than 4 (i.e. at least 5).
releases4crew = crew[crew['Count'] > 4]
# Displayed only: the 20 highest-Mean crew members within that subset.
releases4crew.sort_values(by='Mean', ascending=False)[0:20]


# Side-by-side bar charts of the top 20 crew members by Mean (left) and by
# Median (right). Both panels draw from `crew`, not from the Count-filtered
# `releases4crew`.
fig = plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.bar(data=crew.sort_values(by='Mean', ascending=False)[0:20], x='Crew', height='Mean')
# Rotate the name labels so they do not overlap.
plt.xticks(rotation=90)
plt.title("Mean Plot")
plt.subplot(1, 2, 2)
plt.bar(data=crew.sort_values(by='Median', ascending=False)[0:20], x='Crew', height='Median')
plt.xticks(rotation=90)
plt.title("Median Plot")
plt.show()


# Rank the Count-filtered crew members by ascending Mean: the lowest Mean gets
# rank 1 and the highest Mean gets the largest rank.
releases4crew = releases4crew.sort_values(by='Mean').reset_index(drop=True)
crewRank = {
    name: position
    for position, name in enumerate(releases4crew['Crew'], start=1)
}
# Last expression of the cell: displays the name -> rank mapping.
crewRank

{'Federico Fellini': 1,
 'André Téchiné': 2,
 'Ingmar Bergman': 3,
 'Todd Solondz': 4,
 'Errol Morris': 5,
 'Warren Miller': 6,
 'Bill Forsyth': 7,
 'Krzysztof Kieslowski': 8,
 'Krzysztof Piesiewicz': 9,
 'Sally Potter': 10,
 'Louis Malle': 11,
 'Mark Monroe': 12,
 'Henry James': 13,
 'Alan Rudolph': 14,
 'Zalman King': 15,
 'Hanif Kureishi': 16,
 'Atom Egoyan': 17,
 'Naomi Foner': 18,
 'François Truffaut': 19,
 'Bille August': 20,
 'Werner Herzog': 21,
 'Desmond Nakano': 22,
 'Alfred Hitchcock': 23,
 'Agnieszka Holland': 24,
 'Sam Peckinpah': 25,
 'Olivier Assayas': 26,
 'Harold Pinter': 27,
 'Patrice Leconte': 28,
 'Laura Jones': 29,
 'Allan Scott': 30,
 'Billy Wilder': 31,
 'Don Coscarelli': 32,
 'Whit Stillman': 33,
 'Sergio Donati': 34,
 'David Zelag Goodman': 35,
 'James R. Silke': 36,
 'Robin Bhatt': 37,
 'Frank Cottrell Boyce': 38,
 'Menahem Golan': 39,
 'Michael Winterbottom': 40,
 'Neal Jimenez': 41,
 'Farhan Akhtar': 42,
 'Denys Arcand': 43,
 'Uwe Boll': 44,
 'Eric Red': 45,
 'J. Lee Thompson': 46,
 'John Huston': 47,
 'Stanley Donen': 48,
 'Albert Brooks': 49,
 'Sam Firstenberg': 50,
 'Terry Jones': 51,
 'Bob Rafelson': 52,
 'Monica Mcgowan Johnson': 53,
 'Aaron Norris': 54,
 'Anvita Dutt': 55,
 'Oren Moverman': 56,
 'Gérard Brach': 57,
 'Ken Hixon': 58,
 'Darin Scott': 59,
 'Ruth Prawer Jhabvala': 60,
 'Christopher Guest': 61,
 'Susan Seidelman': 62,
 'Michael Thomas': 63,
 'Ken Russell': 64,
 'Christopher Crowe': 65,
 'E. Max Frye': 66,
 'John Irvin': 67,
 'Mike Leigh': 68,
 'David Lynch': 69,
 'Richard Fleischer': 70,
 'Paul Schrader': 71,
 'Jérôme Tonnerre': 72,
 'Jim Jarmusch': 73,
 'Emilio Estevez': 74,
 'Sergio Leone': 75,
 'Albert Pyun': 76,
 'Imtiaz Ali': 77,
 'Nicole Holofcener': 78,
 'Anurag Kashyap': 79,
 'Michael Winner': 80,
 'Eugene Levy': 81,
 'Bill Norton': 82,
 'James Ivory': 83,
 'Randall Miller': 84,
 'Mira Nair': 85,
 'Michael Dowse': 86,
 'Richard Pearce': 87,
 'Robert Mandel': 88,
 'John Dahl': 89,
 'Priyadarshan': 90,
 'Charlie Peters': 91,
 'Jane Campion': 92,
 'Roland Joffé': 93,
 'Tom Ropelewski': 94,
 'Sidney J. Furie': 95,
 'Michael Radford': 96,
 'Todd Haynes': 97,
 'Carroll Ballard': 98,
 'Gillian Armstrong': 99,
 'Ulu Grosbard': 100,
 'Fred Schepisi': 101,
 'Stewart Raffill': 102,
 'John Curran': 103,
 'Davis Guggenheim': 104,
 'John Sayles': 105,
 'Tony Bill': 106,
 'David Shaber': 107,
 'Alan Ormsby': 108,
 'Todd Graff': 109,
 'David Mickey Evans': 110,
 'Luciano Vincenzoni': 111,
 'Ernest R. Dickerson': 112,
 'Thom Eberhardt': 113,
 'Clive Barker': 114,
 'Peter Yates': 115,
 'John Briley': 116,
 'Jacques Audiard': 117,
 'Bernardo Bertolucci': 118,
 'William Shakespeare': 119,
 'Brian Garfield': 120,
 'Alan Bennett': 121,
 'Christopher Cain': 122,
 'Michael Schultz': 123,
 'Ben Hecht': 124,
 'John Boorman': 125,
 'Rudy De Luca': 126,
 'Martha Coolidge': 127,
 'Douglas McGrath': 128,
 'Franco Zeffirelli': 129,
 'Lars von Trier': 130,
 'Phil Joanou': 131,
 'James Toback': 132,
 'Bruce Robinson': 133,
 'Sanjay Leela Bhansali': 134,
 'Jeff Kanew': 135,
 'E.M. Forster': 136,
 'Sidney Lumet': 137,
 'Rick Rosenthal': 138,
 'Paul Mazursky': 139,
 'Robert Townsend': 140,
 'Michael Haneke': 141,
 'Mike Binder': 142,
 'Steven Pressfield': 143,
 'Miguel Tejada-Flores': 144,
 'Andrew Fleming': 145,
 'Mike Figgis': 146,
 'Susanne Bier': 147,
 'John Frankenheimer': 148,
 'Hal Ashby': 149,
 'Siddharth Anand': 150,
 'John Crowley': 151,
 'Steve Rash': 152,
 'Daniel Petrie': 153,
 'Richard Eyre': 154,
 'Richard Linklater': 155,
 'Edward Tang': 156,
 'Joe Camp': 157,
 'Mark L. Lester': 158,
 'Karan Johar': 159,
 'Niranjan Iyengar': 160,
 'Clifford Green': 161,
 'Don Siegel': 162,
 'Rohit Shetty': 163,
 'Anees Bazmee': 164,
 'Jean-Claude Carrière': 165,
 'Farhad Samji': 166,
 'Tim Metcalfe': 167,
 'John R. Cherry III': 168,
 'Robert Altman': 169,
 'Michael Cimino': 170,
 'Vidhu Vinod Chopra': 171,
 'John Schlesinger': 172,
 'Peter Bogdanovich': 173,
 'Prakash Kapadia': 174,
 'David Rayfiel': 175,
 'Paul Brickman': 176,
 'Jay Presson Allen': 177,
 'Rick Famuyiwa': 178,
 'Yunus Sajawal': 179,
 'David Cronenberg': 180,
 'Billy Bob Thornton': 181,
 'Brent Goldberg': 182,
 'Sajid': 183,
 'Paul Dehn': 184,
 'Barbet Schroeder': 185,
 'Martin Ritt': 186,
 'Sean McNamara': 187,
 "Pat O'Connor": 188,
 'Jonathan Kaplan': 189,
 'Howard Franklin': 190,
 'James Carabatsos': 191,
 'Bill Duke': 192,
 'Jeremy Brock': 193,
 'Blake Edwards': 194,
 'Tom Holland': 195,
 'Larry Gross': 196,
 'Dan Gordon': 197,
 'Robert Klane': 198,
 'Neil Simon': 199,
 'Gurinder Chadha': 200,
 'Paul Michael Glaser': 201,
 'Dean Riesner': 202,
 'James Bridges': 203,
 'Dennis Shryack': 204,
 'Miguel Arteta': 205,
 'Sean S. Cunningham': 206,
 'Larry Cohen': 207,
 'François Ozon': 208,
 'Richard Loncraine': 209,
 'Bill Lancaster': 210,
 'Kevin Smith': 211,
 'W.D. Richter': 212,
 'Ernest Lehman': 213,
 'Mary Agnes Donoghue': 214,
 'Richard Attenborough': 215,
 'Neil LaBute': 216,
 'Norman Jewison': 217,
 'David Newman': 218,
 'Jon Erwin': 219,
 'Stephen J. Rivele': 220,
 'William Dear': 221,
 'Brent Maddock': 222,
 'Hugh Hudson': 223,
 'Darryl Ponicsan': 224,
 'George A. Romero': 225,
 'Mark Pellington': 226,
 'Aditya Chopra': 227,
 'Charlie Kaufman': 228,
 'Paul Mayeda Berges': 229,
 'Nicholas Hytner': 230,
 'Chuck Konzelman': 231,
 'Cary Solomon': 232,
 'Chris Matheson': 233,
 'Brad Anderson': 234,
 'Franklin J. Schaffner': 235,
 'Bruce Beresford': 236,
 'Jean-Claude Van Damme': 237,
 'Richard Benjamin': 238,
 'Gene Quintano': 239,
 'John Stockwell': 240,
 'Cheech Marin': 241,
 'Jim Sheridan': 242,
 'Marshall Brickman': 243,
 'Tobe Hooper': 244,
 'Ron Shelton': 245,
 'Wim Wenders': 246,
 'Fred Dekker': 247,
 'Kar-Wai Wong': 248,
 'Arthur Hiller': 249,
 'Rob Zombie': 250,
 'Nicholas Kazan': 251,
 'Matthew Robbins': 252,
 'Michael Pressman': 253,
 'Bob Clark': 254,
 'Tommy Chong': 255,
 'Carl Reiner': 256,
 'Robert Benton': 257,
 'Carl Franklin': 258,
 'Bruce A. Evans': 259,
 'Woody Allen': 260,
 'Dalton Trumbo': 261,
 'Tom Mankiewicz': 262,
 'Pedro Almodóvar': 263,
 'Alan Parker': 264,
 'Nancy Dowd': 265,
 'Neil Jordan': 266,
 'DJ Pooh': 267,
 'Stephen Frears': 268,
 'Albert Magnoli': 269,
 'Reginald Hudlin': 270,
 'Chris Rock': 271,
 'Leon Capetanos': 272,
 'Michael Cristofer': 273,
 'Kevin Macdonald': 274,
 'Ted Kotcheff': 275,
 'Peter Dexter': 276,
 'Stirling Silliphant': 277,
 'John Irving': 278,
 'Jeremiah S. Chechik': 279,
 'Steve Zacharias': 280,
 'Terry George': 281,
 'Alan B. McElroy': 282,
 'Andy Breckman': 283,
 'Neal Israel': 284,
 'Spike Lee': 285,
 'Richard Lester': 286,
 'Jonathan Bernstein': 287,
 'Raynold Gideon': 288,
 'Stan Dragoti': 289,
 'David Hare': 290,
 'Christopher Hampton': 291,
 'Wayne Wang': 292,
 'Peter Straughan': 293,
 'Guy Hamilton': 294,
 'Dwight H. Little': 295,
 'Norman Steinberg': 296,
 'Michael Ritchie': 297,
 'Irwin Winkler': 298,
 'Lewis Teague': 299,
 'Stephen Kendrick': 300,
 'Simon Wincer': 301,
 'Don Jakoby': 302,
 'James Gray': 303,
 'Victor Salva': 304,
 'Steve Miner': 305,
 'Walter Hill': 306,
 'Michael Hoffman': 307,
 'Eric Bernt': 308,
 'Paul McGuigan': 309,
 'Michael Lehmann': 310,
 'John Waters': 311,
 'John Carpenter': 312,
 'Robert Getchell': 313,
 'Anders Thomas Jensen': 314,
 'Roman Polanski': 315,
 'Howard Deutch': 316,
 'Milos Forman': 317,
 'Herbert Ross': 318,
 'Andrew Klavan': 319,
 'Ice Cube': 320,
 'John Schultz': 321,
 'I.A.L. Diamond': 322,
 'Abi Morgan': 323,
 'Russell Mulcahy': 324,
 'Dave Eggers': 325,
 'Brett Leonard': 326,
 'Ian McEwan': 327,
 'Bo Goldman': 328,
 'Grégory Levasseur': 329,
 'Francis Veber': 330,
 'David Seltzer': 331,
 'Harry Elfont': 332,
 'John Guillermin': 333,
 'Gregory Widen': 334,
 'Mick Garris': 335,
 'Alastair Fothergill': 336,
 'Robert Bolt': 337,
 'David Loughery': 338,
 'Don Mancini': 339,
 'David Mamet': 340,
 'Vijay Krishna Acharya': 341,
 'Donald E. Westlake': 342,
 'Elizabeth Chandler': 343,
 'Jerry Juhl': 344,
 'Douglas Day Stewart': 345,
 'John Hodge': 346,
 'Matthew Stone': 347,
 'Ted Demme': 348,
 'Jonathan Lynn': 349,
 'Buck Henry': 350,
 'Howard Zieff': 351,
 'Alex Kendrick': 352,
 'Gary Goldman': 353,
 'Craig Gillespie': 354,
 'John le Carré': 355,
 'Harold Becker': 356,
 'Jay Chandrasekhar': 357,
 'Lewis Colick': 358,
 'Dick Clement': 359,
 'Deborah Kaplan': 360,
 'Robert Ramsey': 361,
 'Karen Janszen': 362,
 'Peter Hedges': 363,
 'Thomas Rickman': 364,
 'Lem Dobbs': 365,
 'Billy Crystal': 366,
 'Luis Mandoki': 367,
 'Steve Pink': 368,
 'Paul Thomas Anderson': 369,
 'Ian La Frenais': 370,
 'Mark Rydell': 371,
 'Iain Softley': 372,
 'Gus Van Sant': 373,
 'Allison Burnett': 374,
 'Charles Edward Pogue': 375,
 'Tina Gordon': 376,
 'Joe Dante': 377,
 'Terrence Malick': 378,
 'John Milius': 379,
 'Sofia Coppola': 380,
 'David Lean': 381,
 'John Kamps': 382,
 'Dennis Feldman': 383,
 'Jane Austen': 384,
 'Ken Kwapis': 385,
 'James Ellroy': 386,
 'Ron Underwood': 387,
 'Peter Hyams': 388,
 'Victor Miller': 389,
 'Andrew Bergman': 390,
 'Akira Kurosawa': 391,
 'Sheldon Lettich': 392,
 'Elmore Leonard': 393,
 'Gary Fleder': 394,
 'Charles Shyer': 395,
 'Michael Moore': 396,
 'Terry Gilliam': 397,
 'Heywood Gould': 398,
 'Michael Caton-Jones': 399,
 'Adam Rifkin': 400,
 'Steven Knight': 401,
 'Abhijat Joshi': 402,
 'Mitch Glazer': 403,
 'Joseph Barbera': 404,
 'Don Bluth': 405,
 'Rita M. Fink': 406,
 'Harry Julian Fink': 407,
 'William Friedkin': 408,
 'Mark Neveldine': 409,
 'Craig Bolotin': 410,
 'Hugh Wilson': 411,
 'Robert Towne': 412,
 'Jean-Jacques Annaud': 413,
 "Gavin O'Connor": 414,
 'Kathryn Bigelow': 415,
 'Mark Brown': 416,
 'Andrew Birkin': 417,
 'Joe Roth': 418,
 'J. Mills Goodloe': 419,
 'Brian Robbins': 420,
 'Scott Hicks': 421,
 'Jonathan Demme': 422,
 'Jeannot Szwarc': 423,
 'Stephen Hopkins': 424,
 'Roger Kumble': 425,
 'Walon Green': 426,
 'Eli Roth': 427,
 'Carol Sobieski': 428,
 'Stuart Gordon': 429,
 'Hal Needham': 430,
 'Ron Nyswaner': 431,
 'Peter Chelsom': 432,
 'Steve Martin': 433,
 'Thomas Meehan': 434,
 'Tom Tykwer': 435,
 'Donald Petrie': 436,
 'Stanley Kubrick': 437,
 'Tom Stoppard': 438,
 'George Tillman Jr.': 439,
 'Ronald Harwood': 440,
 'John G. Avildsen': 441,
 'Ronny Yu': 442,
 'Tyler Perry': 443,
 'Danny DeVito': 444,
 'Robert Redford': 445,
 'Taylor Hackford': 446,
 'David S. Ward': 447,
 'Dale Launer': 448,
 'Mary Shelley': 449,
 'Brian Taylor': 450,
 'Stephen King': 451,
 'Joseph Ruben': 452,
 'George Roy Hill': 453,
 'Charles Martin Smith': 454,
 'John Pogue': 455,
 'Harris Goldberg': 456,
 'Larry Ferguson': 457,
 'Steven Brill': 458,
 'John Patrick Shanley': 459,
 'Robert King': 460,
 'Chuck Pfarrer': 461,
 'Sam Weisman': 462,
 'Babaloo Mandel': 463,
 'George Gallo': 464,
 'Caroline Thompson': 465,
 'Randal Kleiser': 466,
 'Tom Schulman': 467,
 'Richard Price': 468,
 'William Goldman': 469,
 'Jonathan Levine': 470,
 'Josh Stolberg': 471,
 'Peter Hewitt': 472,
 'Rod Daniel': 473,
 'Thomas Carter': 474,
 'Jay Cocks': 475,
 'Don Roos': 476,
 'Phil Alden Robinson': 477,
 'John Glen': 478,
 'Francis Ford Coppola': 479,
 'Wes Craven': 480,
 'Alexandre Aja': 481,
 'John Badham': 482,
 'Larry Karaszewski': 483,
 'Mel Brooks': 484,
 'Debra Hill': 485,
 'Alex Garland': 486,
 'Scott Alexander': 487,
 'Lowell Ganz': 488,
 'Albert S. Ruddy': 489,
 'James Schamus': 490,
 'Wes Anderson': 491,
 'Michael Grais': 492,
 'Robert Wise': 493,
 'John Landis': 494,
 'Hark Tsui': 495,
 'John Whitesell': 496,
 'John Romano': 497,
 'Rob Reiner': 498,
 'Charles Dickens': 499,
 'Andrzej Bartkowiak': 500,
 'Malcolm D. Lee': 501,
 'Les Mayfield': 502,
 'David Webb Peoples': 503,
 'Michel Gondry': 504,
 'James Orr': 505,
 'Michael Apted': 506,
 'Sarah Kernochan': 507,
 'Alan J. Pakula': 508,
 'Nick Hornby': 509,
 'Barry Levinson': 510,
 'Curtis Hanson': 511,
 'Stephen Herek': 512,
 'Roger Donaldson': 513,
 'Jon Avnet': 514,
 'Wesley Strick': 515,
 'Gregory Hoblit': 516,
 'Nick Castle': 517,
 'Roger Michell': 518,
 'Joe Eszterhas': 519,
 'George Clooney': 520,
 'David Gordon Green': 521,
 'Warren Beatty': 522,
 'Pat Proft': 523,
 'James McTeigue': 524,
 'Greg Mottola': 525,
 'Allen Hughes': 526,
 'Frank Oz': 527,
 'Mike Rich': 528,
 'Lewis Gilbert': 529,
 'Paul Rudnick': 530,
 'S.S. Wilson': 531,
 'John Fusco': 532,
 'Larry McMurtry': 533,
 'Leonard Nimoy': 534,
 'Richard LaGravenese': 535,
 'Jim Henson': 536,
 'Aaron Seltzer': 537,
 'Jason Friedberg': 538,
 'Glen Morgan': 539,
 'Mike White': 540,
 'Penny Marshall': 541,
 'Jon Amiel': 542,
 'Mike Nichols': 543,
 'Bill Walsh': 544,
 'Diablo Cody': 545,
 'Hal Barwood': 546,
 'Mark Rosenthal': 547,
 'Brian De Palma': 548,
 'Richard Maibaum': 549,
 'Gerald Di Pego': 550,
 'Amy Holden Jones': 551,
 'Albert Uderzo': 552,
 'René Goscinny': 553,
 'Penelope Spheeris': 554,
 'George P. Cosmatos': 555,
 'Lasse Hallström': 556,
 'Jean-Pierre Jeunet': 557,
 'David Levien': 558,
 'Brian Koppelman': 559,
 'Michael Miner': 560,
 'David Zucker': 561,
 'Ann Biderman': 562,
 'Oliver Stone': 563,
 'Keenen Ivory Wayans': 564,
 'Robert Harling': 565,
 'Jason Reitman': 566,
 'Larry Gelbart': 567,
 'Yimou Zhang': 568,
 'Amy Heckerling': 569,
 'Frank Pierson': 570,
 'Daniel Waters': 571,
 'Pierre Morel': 572,
 'Jim Kouf': 573,
 'Simon Wells': 574,
 'Kirk Jones': 575,
 'Philip Kaufman': 576,
 'Nicholas Meyer': 577,
 'Roger Spottiswoode': 578,
 'Spike Jonze': 579,
 'James DeMonaco': 580,
 'Ethan Coen': 581,
 'Joel Coen': 582,
 'Edgar Rice Burroughs': 583,
 'Michael Schiffer': 584,
 'Leslie Bohem': 585,
 'Herschel Weingrod': 586,
 'Kunihiko Yuyama': 587,
 'John Singleton': 588,
 'Michael G. Wilson': 589,
 'Albert Hughes': 590,
 'Danny Boyle': 591,
 'Harold Ramis': 592,
 'Fred Wolf': 593,
 'Michael Tolkin': 594,
 'Satoshi Tajiri': 595,
 'Gloria Katz': 596,
 'Jessie Nelson': 597,
 'David R. Ellis': 598,
 'Mark Andrus': 599,
 'David Twohy': 600,
 'Renny Harlin': 601,
 'James Foley': 602,
 'Nora Ephron': 603,
 'Jim Taylor': 604,
 'Marcus Dunstan': 605,
 'Patrick Melton': 606,
 'Alexandre Dumas': 607,
 'Allan Loeb': 608,
 'Noah Baumbach': 609,
 'Cameron Crowe': 610,
 'Julian Fellowes': 611,
 'Jeremy Leven': 612,
 'Robin Swicord': 613,
 'Mark Steven Johnson': 614,
 'Delia Ephron': 615,
 'Nicholas Sparks': 616,
 'John Ridley': 617,
 'Joel Schumacher': 618,
 'Ted Griffin': 619,
 'John Madden': 620,
 'David Bowers': 621,
 'Jim Abrahams': 622,
 'Kaige Chen': 623,
 'Catherine Hardwicke': 624,
 'Jeb Stuart': 625,
 'Peter Tolan': 626,
 'Glenn Ficarra': 627,
 'Andrew Davis': 628,
 'Anthony Minghella': 629,
 'Daniel Pyne': 630,
 'George Cukor': 631,
 'Terry Hayes': 632,
 'William D. Wittliff': 633,
 'Peter Weir': 634,
 'Robert Rodriguez': 635,
 'Martin Scorsese': 636,
 'Tom S. Parker': 637,
 'Ted Tally': 638,
 'Marlon Wayans': 639,
 'Willard Huyck': 640,
 'Gary Winick': 641,
 'John Hughes': 642,
 'Wilfred Jackson': 643,
 'Steve Carr': 644,
 'Roger Avary': 645,
 'Mark Waters': 646,
 'William Hanna': 647,
 'Nick Cassavetes': 648,
 'Clint Eastwood': 649,
 'Steven Soderbergh': 650,
 'Jerry Belson': 651,
 'Kenneth Lonergan': 652,
 'Hossein Amini': 653,
 'Ronald Bass': 654,
 'George Miller': 655,
 'Clyde Geronimi': 656,
 'Arthur Conan Doyle': 657,
 'Roald Dahl': 658,
 'Lee Tamahori': 659,
 'Sydney Pollack': 660,
 'Richard Matheson': 661,
 'Michael H. Weber': 662,
 'Scott Neustadter': 663,
 'Frank Darabont': 664,
 'Edward Neumeier': 665,
 'Christopher Landon': 666,
 'Steven Rogers': 667,
 'Andy Fickman': 668,
 'Joel Cohen': 669,
 'Michael Mann': 670,
 'William Osborne': 671,
 'Stephen Gaghan': 672,
 'Paul Weitz': 673,
 'Stieg Larsson': 674,
 'Justin Haythe': 675,
 'Johnny Knoxville': 676,
 'Brian Helgeland': 677,
 'Dominic Sena': 678,
 'Garry Marshall': 679,
 'Barry W. Blaustein': 680,
 'Len Blum': 681,
 'Dan Aykroyd': 682,
 'Phillip Noyce': 683,
 'Ed Decter': 684,
 'Lawrence Konner': 685,
 'Joe Wright': 686,
 'Darren Aronofsky': 687,
 'Kevin Williamson': 688,
 'John Thomas': 689,
 'Evan Goldberg': 690,
 'David Dobkin': 691,
 'Gregory Poirier': 692,
 'Bobby Farrelly': 693,
 'Audrey Wells': 694,
 'Juliet Snowden': 695,
 'Joe Carnahan': 696,
 'Judd Apatow': 697,
 'Frank Miller': 698,
 'Harald Zwart': 699,
 'Simon Pegg': 700,
 'Dennis Lehane': 701,
 'Nat Mauldin': 702,
 'Menno Meyjes': 703,
 'Ben Stiller': 704,
 'Mario Puzo': 705,
 'Ishirô Honda': 706,
 'Emile Ardolino': 707,
 'David O. Russell': 708,
 'Timothy Harris': 709,
 'Leslie Dixon': 710,
 'Wolfgang Reitherman': 711,
 'Gabriele Muccino': 712,
 'Luc Besson': 713,
 'Jason Segel': 714,
 'Maya Forbes': 715,
 'John Grisham': 716,
 'Tim Story': 717,
 'John Requa': 718,
 'Aline Brosh McKenna': 719,
 'Marc Lawrence': 720,
 'Tarsem Singh': 721,
 'Ian Fleming': 722,
 'Jeffrey Reddick': 723,
 'Brian Levant': 724,
 'John J. Strauss': 725,
 'Kurt Wimmer': 726,
 'Richard Marquand': 727,
 'D.J. Caruso': 728,
 'Karen McCullah': 729,
 'Kirsten Smith': 730,
 'Seth Rogen': 731,
 'Daniel Petrie Jr.': 732,
 'Edward Zwick': 733,
 'Kevin Reynolds': 734,
 'Paul Attanasio': 735,
 'Carl Gottlieb': 736,
 'Robert Nelson Jacobs': 737,
 'John McTiernan': 738,
 'Denis Villeneuve': 739,
 'Steven E. de Souza': 740,
 'Guillermo del Toro': 741,
 'Jez Butterworth': 742,
 'P.J. Hogan': 743,
 'Leigh Whannell': 744,
 'Gavin Hood': 745,
 'Alec Sokolow': 746,
 'Antoine Fuqua': 747,
 'Ken Kaufman': 748,
 'David Frankel': 749,
 'Mick Jackson': 750,
 'John Cleese': 751,
 'Jaume Collet-Serra': 752,
 'Will Ferrell': 753,
 'Jim Thomas': 754,
 'Andrew Niccol': 755,
 'Martin Brest': 756,
 'Aaron Sorkin': 757,
 'Adrian Lyne': 758,
 'Larry Charles': 759,
 'Pierre Boulle': 760,
 'David Berenbaum': 761,
 'Dana Fox': 762,
 'William Peter Blatty': 763,
 'Peter Segal': 764,
 'Ang Lee': 765,
 'Richard Donner': 766,
 'Nancy Meyers': 767,
 'Boaz Yakin': 768,
 'Rob Cohen': 769,
 'John Gatins': 770,
 'J.F. Lawton': 771,
 'Shawn Wayans': 772,
 'Andrew Davies': 773,
 'Peter Farrelly': 774,
 'Karey Kirkpatrick': 775,
 'Bruce Joel Rubin': 776,
 'Matt Manfredi': 777,
 'Chuck Russell': 778,
 'Neil Gaiman': 779,
 'Paul Hogan': 780,
 'John Morris': 781,
 'Michael Petroni': 782,
 'Jeffrey Boam': 783,
 'John Moore': 784,
 'Charles Leavitt': 785,
 'Dennis Dugan': 786,
 'Donald E. Stewart': 787,
 'Susannah Grant': 788,
 'Phil Hay': 789,
 'Michael Ferris': 790,
 'Ivan Reitman': 791,
 'Duane Adler': 792,
 'Robert Luketic': 793,
 'Alexander Payne': 794,
 'Andrew Kevin Walker': 795,
 'Eric Heisserer': 796,
 'Adam Shankman': 797,
 'Philip K. Dick': 798,
 "Dan O'Bannon": 799,
 'Jerry Zucker': 800,
 'Tony Scott': 801,
 'Andy Tennant': 802,
 'Peter Laird': 803,
 'Sean Anders': 804,
 'Ben Affleck': 805,
 'Oren Peli': 806,
 'Mark Frost': 807,
 'William Nicholson': 808,
 'Emma Thompson': 809,
 'Anne Fletcher': 810,
 'Eric Roth': 811,
 'Robert Mark Kamen': 812,
 'John Brancato': 813,
 'Adam Cooper': 814,
 'Bill Collage': 815,
 'Gene Roddenberry': 816,
 'Greg Berlanti': 817,
 'John Lee Hancock': 818,
 'Billy Ray': 819,
 'Rich Wilkes': 820,
 'Frank Coraci': 821,
 'Ed Solomon': 822,
 'Graham Yost': 823,
 'Nicholas Stoller': 824,
 'Peter Benchley': 825,
 'Richard Wenk': 826,
 'Betty Thomas': 827,
 'Eddie Murphy': 828,
 'Peter Berg': 829,
 'Seth Gordon': 830,
 'Alvin Sargent': 831,
 'Ronald Shusett': 832,
 'Tim Hill': 833,
 'Marshall Herskovitz': 834,
 'Paul W.S. Anderson': 835,
 'Will Gluck': 836,
 'Alejandro G. Iñárritu': 837,
 'Paul Verhoeven': 838,
 'Robert Rodat': 839,
 'James L. Brooks': 840,
 'Marc Silverstein': 841,
 'Abby Kohn': 842,
 'Damien Chazelle': 843,
 'William Monahan': 844,
 'Craig Pearce': 845,
 'Sylvester Stallone': 846,
 'David Ayer': 847,
 'Hayao Miyazaki': 848,
 'Robert Gordon': 849,
 'J.M. Barrie': 850,
 'Danilo Bach': 851,
 'Tom McCarthy': 852,
 'John Hamburg': 853,
 'Doug Liman': 854,
 'Kenneth Branagh': 855,
 'Alex Proyas': 856,
 'Quentin Tarantino': 857,
 'Steven Zaillian': 858,
 'James V. Hart': 859,
 'Tim Herlihy': 860,
 'Dan Gilroy': 861,
 'Gary Ross': 862,
 'Jack Epps Jr.': 863,
 'Jim Cash': 864,
 'Brad Silberling': 865,
 'Jonathan Mostow': 866,
 'Thomas Harris': 867,
 'James Vanderbilt': 868,
 'Scott Frank': 869,
 'Mike Myers': 870,
 'Chad Hayes': 871,
 'Don Rhymer': 872,
 'Steve Koren': 873,
 'Scott Moore': 874,
 'Tommy Swerdlow': 875,
 'Timothy Dowling': 876,
 'Mike Newell': 877,
 'Scot Armstrong': 878,
 'Steve Conrad': 879,
 'Peter Baynham': 880,
 'Edgar Wright': 881,
 'Steve Oedekerk': 882,
 'Ron Howard': 883,
 'David Benioff': 884,
 'Ridley Scott': 885,
 'Jon M. Chu': 886,
 'Gerry Swallow': 887,
 'Martin Campbell': 888,
 'Adam Sandler': 889,
 'Skip Woods': 890,
 'Adam McKay': 891,
 'Joe Johnston': 892,
 'Wolfgang Petersen': 893,
 'Lorne Cameron': 894,
 'Stephen Sommers': 895,
 'James Mangold': 896,
 'Paul Haggis': 897,
 'Jay Roach': 898,
 'George Nolfi': 899,
 'Bob Gale': 900,
 'Robert Schwentke': 901,
 'Matthew Michael Carnahan': 902,
 'Jonathan Hensleigh': 903,
 'Marc Forster': 904,
 'Baz Luhrmann': 905,
 'John Woo': 906,
 'Jon Lucas': 907,
 'Thomas Lennon': 908,
 'Paul Feig': 909,
 'Simon West': 910,
 'Tom Clancy': 911,
 'Randall Wallace': 912,
 'J. David Stem': 913,
 'Kevin Jarre': 914,
 'Scott Derrickson': 915,
 'Louis Leterrier': 916,
 'Len Wiseman': 917,
 'McG': 918,
 'Mike Werb': 919,
 'Robert Ben Garant': 920,
 'Raja Gosnell': 921,
 'Craig Mazin': 922,
 'Peter Morgan': 923,
 'Tom Shadyac': 924,
 'William Davies': 925,
 'Simon Beaufoy': 926,
 'David Lindsay-Abaire': 927,
 'Chris Columbus': 928,
 'Jay Scherick': 929,
 'Richard Curtis': 930,
 'Dan Fogelman': 931,
 'David N. Weiss': 932,
 'Jon Turteltaub': 933,
 'David Fincher': 934,
 'Brett Ratner': 935,
 'Matt Reeves': 936,
 'Barry Sonnenfeld': 937,
 'Jonathan Liebesman': 938,
 'Tony Gilroy': 939,
 'Robert Zemeckis': 940,
 'Shawn Levy': 941,
 'Shane Black': 942,
 'F. Gary Gray': 943,
 'Sam Raimi': 944,
 'Tim Burton': 945,
 'Joe Shuster': 946,
 'Paul Greengrass': 947,
 'Marianne Wibberley': 948,
 'Mel Gibson': 949,
 'Laeta Kalogridis': 950,
 'Erich Hoeber': 951,
 'Jon Hoeber': 952,
 'Robert Ludlum': 953,
 'David Koepp': 954,
 'Tab Murphy': 955,
 'Akiva Goldsman': 956,
 'William Broyles Jr.': 957,
 'Ruben Fleischer': 958,
 'Lilly Wachowski': 959,
 'Lana Wachowski': 960,
 'Jonathan Goldstein': 961,
 'Alfred Gough': 962,
 'Dean Devlin': 963,
 'Cormac Wibberley': 964,
 'M. Night Shyamalan': 965,
 'Mark Bomback': 966,
 'Lawrence Kasdan': 967,
 'Jeffrey Price': 968,
 'Jerry Siegel': 969,
 'Mike Mitchell': 970,
 'Jan de Bont': 971,
 'John Francis Daley': 972,
 'Brad Peyton': 973,
 'Bill Condon': 974,
 'John Logan': 975,
 'Matthew Vaughn': 976,
 'Alfonso Cuarón': 977,
 'Peyton Reed': 978,
 'Mark Burton': 979,
 'Christopher Miller': 980,
 'Drew Goddard': 981,
 'Darren Lemke': 982,
 'Ron Clements': 983,
 'George Clayton Johnson': 984,
 'Charles Perrault': 985,
 'Jane Goldman': 986,
 'John August': 987,
 'Kelly Asbury': 988,
 'Michael McCullers': 989,
 'Phil Lord': 990,
 'Scott Silver': 991,
 'David S. Goyer': 992,
 'Chris Weitz': 993,
 'Peter S. Seaman': 994,
 'Jake Kasdan': 995,
 'Guy Ritchie': 996,
 'Michael Green': 997,
 'Marc Webb': 998,
 'Etan Cohen': 999,
 'Steven Spielberg': 1000,
 ...}


# Show how many entries the crewRank lookup holds.
len(crewRank)

1091


import random


def getCrewsTeamRank(crews):
    """Add up the ``crewRank`` score of every member in ``crews``.

    A member without an entry in ``crewRank`` contributes a random integer
    from 1 to 100 (inclusive) instead, so the result is only reproducible
    when the ``random`` module has been seeded.

    Args:
        crews: iterable of crew-member names.

    Returns:
        The summed score as an int (0 when ``crews`` is empty).
    """
    return sum(
        crewRank[member] if member in crewRank else random.randint(1, 100)
        for member in crews
    )


# Fresh copy of `data` carrying the summed crew score per movie, then the
# frequency of each score.
data2 = data.assign(crewsTeamRank=data['Crew'].apply(getCrewsTeamRank))
data2['crewsTeamRank'].value_counts()

260     41
74      30
132     30
73      29
102     29
        ..
2315     1
1239     1
637      1
1400     1
2606     1
Name: crewsTeamRank, Length: 2110, dtype: int64


# Pearson correlation of crewsTeamRank against worldwide gross, then a
# scatter plot of the same pair of columns.
corr = pearsonr(data2['crewsTeamRank'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between crewsTeamRank and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='crewsTeamRank', y='Gross_worldwide', color='brown')

Pearsons correlation between crewsTeamRank and Gross: 0.550

<AxesSubplot:xlabel='crewsTeamRank', ylabel='Gross_worldwide'>


# Copy the crewsTeamRank column computed on data2 into `final`.
final['crewsTeamRank'] = data2['crewsTeamRank']


# Order releases4crew by its 'Mean' column, highest first (in place), and
# keep the names found in the first 150 rows.
releases4crew.sort_values(by='Mean', ascending=False, inplace=True)
top150Crew = list(releases4crew['Crew'].iloc[:150])


def getNumTopCrew(crews):
    """Count how many entries of ``crews`` appear in ``top150Crew``.

    Args:
        crews: iterable of crew-member names.

    Returns:
        The number of members (repeats included) found in ``top150Crew``.
    """
    return sum(1 for member in crews if member in top150Crew)


# Fresh copy of `data` with the number of top-150 crew members per movie,
# then the frequency of each count.
data2 = data.assign(NumTopCrew=data['Crew'].apply(getNumTopCrew))
data2['NumTopCrew'].value_counts()

0    7911
1     555
2     187
3      76
4      21
5       2
Name: NumTopCrew, dtype: int64


# Pearson correlation of NumTopCrew against worldwide gross, then a scatter
# plot of the same pair of columns.
corr = pearsonr(data2['NumTopCrew'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between NumTopCrew and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='NumTopCrew', y='Gross_worldwide', color='brown')

Pearsons correlation between NumTopCrew and Gross: 0.621

<AxesSubplot:xlabel='NumTopCrew', ylabel='Gross_worldwide'>


# Copy the NumTopCrew column computed on data2 into `final`.
final['NumTopCrew'] = data2['NumTopCrew']


# Order releases4crew by its 'Mean' column, highest first (in place), and
# keep the names found in the first 50 rows.
releases4crew.sort_values(by='Mean', ascending=False, inplace=True)
top50Crew = list(releases4crew['Crew'].iloc[:50])


def getHasTopCrew(crews):
    """Flag whether at least one member of ``crews`` is in ``top50Crew``.

    Args:
        crews: iterable of crew-member names.

    Returns:
        1 if any member is found in ``top50Crew``, otherwise 0.
    """
    return 1 if any(member in top50Crew for member in crews) else 0


# Fresh copy of `data` with a 0/1 flag for "has a top-50 crew member", then
# the frequency of each flag value.
data2 = data.assign(HasTopCrew=data['Crew'].apply(getHasTopCrew))
data2['HasTopCrew'].value_counts()

0    8480
1     272
Name: HasTopCrew, dtype: int64


# Pearson correlation of HasTopCrew against worldwide gross, then a scatter
# plot of the same pair of columns.
corr = pearsonr(data2['HasTopCrew'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between HasTopCrew and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='HasTopCrew', y='Gross_worldwide', color='brown')

Pearsons correlation between HasTopCrew and Gross: 0.544

<AxesSubplot:xlabel='HasTopCrew', ylabel='Gross_worldwide'>


# Copy the HasTopCrew column computed on data2 into `final`.
final['HasTopCrew'] = data2['HasTopCrew']


# Fresh copy of `data` with the length of each movie's Studios list, then
# the frequency of each length.
data2 = data.assign(NumStudios=data['Studios'].apply(len))
data2['NumStudios'].value_counts()

3    5307
2    2088
1    1291
0      41
4      22
5       3
Name: NumStudios, dtype: int64


# Bar chart of how many movies have each NumStudios value, ordered by that
# value. The axis and column names are set explicitly rather than relying on
# whatever reset_index() produces: pandas 2.0 changed the names coming out of
# value_counts(), so sorting by an implicit 'index' column fails there.
(data2['NumStudios']
 .value_counts()
 .sort_index()
 .rename_axis('index')
 .reset_index(name='NumStudios')
 .plot.bar(x='index', y='NumStudios'))
# Pearson correlation of NumStudios against worldwide gross, then a scatter
# plot of the same pair of columns.
corr = pearsonr(data2['NumStudios'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between NumStudios and Gross: {corr:.3f}')
data2.plot.scatter(x='NumStudios', y='Gross_worldwide', color='purple')

Pearsons correlation between NumStudios and Gross: 0.133

<AxesSubplot:xlabel='NumStudios', ylabel='Gross_worldwide'>


# Copy the NumStudios column computed on data2 into `final`.
final['NumStudios'] = data2['NumStudios']


# Keep every row of data2 except those with 4 or 5 studios. The mask is
# built from and applied to data2 itself, so it does not depend on `data`
# and `data2` sharing the same index; .copy() makes data3 independent.
data3 = data2[~data2['NumStudios'].isin([4, 5])].copy()


# Bar chart of how many movies have each NumStudios value in data3, ordered
# by that value. The axis and column names are set explicitly rather than
# relying on whatever reset_index() produces: pandas 2.0 changed the names
# coming out of value_counts(), so sorting by an implicit 'index' column
# fails there.
(data3['NumStudios']
 .value_counts()
 .sort_index()
 .rename_axis('index')
 .reset_index(name='NumStudios')
 .plot.bar(x='index', y='NumStudios'))
# Pearson correlation of NumStudios against worldwide gross on data3, then a
# scatter plot of the same pair of columns.
corr = pearsonr(data3['NumStudios'], data3['Gross_worldwide'])[0]
print(f'Pearsons correlation between NumStudios and Gross: {corr:.3f}')
data3.plot.scatter(x='NumStudios', y='Gross_worldwide', color='purple')

Pearsons correlation between NumStudios and Gross: 0.135

<AxesSubplot:xlabel='NumStudios', ylabel='Gross_worldwide'>


# Per-studio aggregates (Total, Count, Mean, Median of worldwide gross).
studio = parseWithMoneyAndCount(data, 'Studios')

# Bar chart: the 15 studios with the most movies.
fig = plt.figure(figsize=(8, 4))
data2 = studio.sort_values(by='Count', ascending=False).head(15)
plt.bar(data2['Studios'], data2['Count'], color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Number Of Movies", fontsize=10)
plt.title("Studio and Number Of Movies they work for", fontsize=15)
plt.show()

# Bar chart: the 15 studios with the highest summed gross.
fig = plt.figure(figsize=(8, 4))
data2 = studio.sort_values(by='Total', ascending=False).head(15)
plt.bar(data2['Studios'], data2['Total'], color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Total Gross", fontsize=10)
plt.title("Studio and Total Gross of Movies they work for", fontsize=15)
plt.show()
# Bar chart: the 15 studios with the highest mean gross per movie.
fig = plt.figure(figsize=(8, 4))
data3 = studio.sort_values(by='Mean', ascending=False).head(15)
plt.bar(data3['Studios'], data3['Mean'], color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Studio and Average Gross per movie they work for", fontsize=15)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Average Gross", fontsize=10)
plt.show()

# Bar chart: the 15 studios with the highest median gross per movie.
# The title and y-label name the median; they previously repeated the
# "Average Gross" wording of the mean chart.
fig = plt.figure(figsize=(8, 4))
data3 = studio.sort_values(by='Median', ascending=False)[0:15]
plt.bar(data=data3, x='Studios', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Studio and Median Gross per movie they work for", fontsize=15)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Median Gross", fontsize=10)
plt.show()


# Display the per-studio aggregates ordered by 'Mean', highest first
# (returns a sorted copy; `studio` itself is not reordered).
studio.sort_values(by='Mean', ascending=False)


# Studios with more than 5 movies, ordered by mean gross (highest first).
# NOTE(review): the variable name suggests a threshold of 10, but the filter
# used here is Count > 5 — confirm which one is intended.
studios10Larger = (
    studio.loc[studio['Count'] > 5]
    .sort_values(by='Mean', ascending=False)
)

# Bar chart of the first 25 of those studios, then display the whole frame.
fig = plt.figure(figsize=(8, 4))
plt.bar(studios10Larger['Studios'].head(25), studios10Larger['Mean'].head(25), color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Studio and Average Gross per movie they work for", fontsize=15)
plt.xlabel("Studio", fontsize=10)
plt.ylabel("Average Gross", fontsize=10)
plt.show()
studios10Larger


# Re-sort studios10Larger by mean gross, lowest first (in place), and number
# the studios from 1 upward in that order, so a larger rank means a larger
# mean gross.
studios10Larger.sort_values(by='Mean', ascending=True, inplace=True)
studioRank = {
    name: position
    for position, name in enumerate(studios10Larger['Studios'], start=1)
}
studioRank

{'Concorde Pictures': 1,
 'La Sept Cinéma': 2,
 'Ontario Film Development Corporation': 3,
 'National Film Board of Canada (NFB)': 4,
 'Zenith Entertainment': 5,
 'Duplass Brothers Productions': 6,
 'A&E IndieFilms': 7,
 'British Screen Productions': 8,
 'RAI Radiotelevisione Italiana': 9,
 'American Playhouse': 10,
 'Films A2': 11,
 'Sovereign Pictures': 12,
 'Lorimar Motion Pictures': 13,
 'Arte France Cinéma': 14,
 'Diamond Docs': 15,
 'MK2 Productions': 16,
 'Arts Council of England': 17,
 'Road Movies Filmproduktion': 18,
 'The Rank Organisation': 19,
 'October Films': 20,
 'Alliance Communications Corporation': 21,
 'Live Entertainment': 22,
 'Les Films Ariane': 23,
 'Trimark Pictures': 24,
 'Triumph Films': 25,
 'CBS Theatrical Films': 26,
 'Big Indie Pictures': 27,
 'Alliance Entertainment': 28,
 'Fine Line Features': 29,
 'Cineplex Odeon Films': 30,
 'Les Films Alain Sarde': 31,
 'Gener8Xion Entertainment': 32,
 'Téléfilm Canada': 33,
 'Rhombus Media': 34,
 'Canadian Film Development Corporation (CFDC)': 35,
 'CiBy 2000': 36,
 'Vision PDG': 37,
 'Lorimar Productions': 38,
 'Merchant Ivory Productions': 39,
 'Destination Films': 40,
 'Balcor Film Investors': 41,
 'Renaissance Films': 42,
 'Celluloid Dreams': 43,
 'New World Pictures': 44,
 'Alliance Atlantis Communications': 45,
 'Weintraub Entertainment Group': 46,
 'Atlantic Entertainment Group': 47,
 'Danmarks Radio (DR)': 48,
 'Silver Screen Partners': 49,
 'The Samuel Goldwyn Company': 50,
 'Killer Films': 51,
 'Avenue Pictures': 52,
 'DD Productions': 53,
 'Golan-Globus Productions': 54,
 'De Laurentiis Entertainment Group (DEG)': 55,
 'Bandai Visual Company': 56,
 'CG Cinéma': 57,
 'Det Danske Filminstitut': 58,
 'Island Pictures': 59,
 'Savoy Pictures': 60,
 'Sony Pictures Classics': 61,
 'Worldview Entertainment': 62,
 'Film Victoria': 63,
 'Price Entertainment': 64,
 'Shree Ashtavinayak Cine Vision': 65,
 'Cinémaginaire Inc.': 66,
 'Hart Sharp Entertainment': 67,
 'Cannon Films': 68,
 'The Cannon Group': 69,
 'Recorded Picture Company (RPC)': 70,
 'The Australian Film Commission': 71,
 'France 3 Cinéma': 72,
 'A&M Films': 73,
 'Delphi III Productions': 74,
 'Kings Road Entertainment': 75,
 'Muse Productions': 76,
 'Serendipity Point Films': 77,
 'The Mirisch Corporation': 78,
 'Broad Green Pictures': 79,
 'British Film Institute (BFI)': 80,
 'Element Pictures': 81,
 'Alive Films': 82,
 'Distant Horizon': 83,
 'Capitol Films': 84,
 'Les Films du Losange': 85,
 'Sweetland Films': 86,
 'ML Delphi Premier Productions': 87,
 'Amazon Studios': 88,
 'FilmFour': 89,
 'Good Machine': 90,
 'Viacom18 Motion Pictures': 91,
 'Likely Story': 92,
 'Palace Pictures': 93,
 'Gladden Entertainment': 94,
 'Australian Film Finance Corporation (AFFC)': 95,
 'Bavaria Film': 96,
 'Rai Cinema': 97,
 'Island World': 98,
 'Excel Entertainment': 99,
 'Isle of Man Film': 100,
 'Artisan Entertainment': 101,
 'Eros Worldwide': 102,
 'MDP Worldwide': 103,
 'John Wells Productions': 104,
 'British Broadcasting Corporation (BBC)': 105,
 'Producers Sales Organization (PSO)': 106,
 'Element Films': 107,
 'Detour Filmproduction': 108,
 'Empire Pictures': 109,
 'Epic Productions': 110,
 'Embassy Pictures': 111,
 'HanWay Films': 112,
 'Pandora Filmproduktion': 113,
 'Orly Films': 114,
 'Nelvana': 115,
 'TAFT Entertainment Pictures': 116,
 'Screen Ireland': 117,
 'The Bubble Factory': 118,
 'IFC Productions': 119,
 'Renn Productions': 120,
 'Bórd Scannán na hÉireann': 121,
 'Automatik Entertainment': 122,
 'Redchillies.VFX': 123,
 "Centre national du cinéma et de l'image animée (CNC)": 124,
 'Fandango': 125,
 'ApolloMedia Distribution': 126,
 'Goldcrest Films International': 127,
 'Haut et Court': 128,
 'ARTE': 129,
 'Channel Four Films': 130,
 'Cinerenta Medienbeteiligungs KG': 131,
 'Brightlight Pictures': 132,
 'Why Not Productions': 133,
 'Propaganda Films': 134,
 'Warner Independent Pictures (WIP)': 135,
 'Nadiadwala Grandson Entertainment': 136,
 'GreeneStreet Films': 137,
 'France 2 Cinéma': 138,
 'Fox STAR Studios': 139,
 'Dharma Productions': 140,
 'Red Chillies Entertainment': 141,
 'Hemdale': 142,
 'Kingsgate Films': 143,
 'Enigma Productions': 144,
 'Spelling Films': 145,
 'Zentropa Entertainments': 146,
 'Jack Rollins & Charles H. Joffe Productions': 147,
 'Marty Katz Productions': 148,
 'Depth of Field': 149,
 'Lorimar Film Entertainment': 150,
 'Incorporated Television Company (ITC)': 151,
 'Sean S. Cunningham Films': 152,
 'Film i Väst': 153,
 'HandMade Films': 154,
 'ABC Motion Pictures': 155,
 'Provident Films': 156,
 'Overture Films': 157,
 '2929 Productions': 158,
 'Groundswell Productions': 159,
 'RKO Pictures': 160,
 'LD Entertainment': 161,
 'Brooksfilms': 162,
 'Vestron Pictures': 163,
 'A24': 164,
 'Big Beach Films': 165,
 'Broadway Pictures': 166,
 'Fox Atomic': 167,
 'Delphi V Productions': 168,
 'This Is That Productions': 169,
 'Largo Entertainment': 170,
 'Industry Entertainment': 171,
 "Hell's Kitchen Films": 172,
 'Medusa Film': 173,
 'Hawn / Sylbert Movie Company': 174,
 'View Askew Productions': 175,
 'First Artists': 176,
 '40 Acres & A Mule Filmworks': 177,
 'Rysher Entertainment': 178,
 'Orion Pictures': 179,
 'Jackson/McHenry Company': 180,
 'The': 181,
 'H2L Media Group': 182,
 'Franchise Pictures': 183,
 'Pressman Film': 184,
 'JVC Entertainment Networks': 185,
 'Canal+ España': 186,
 'Yash Raj Films': 187,
 'Turner Pictures (I)': 188,
 'IAC Films': 189,
 'El Deseo': 190,
 'Sidney Kimmel Entertainment': 191,
 'UTV Motion Pictures': 192,
 'Delphi II Productions': 193,
 'IM Global': 194,
 'BBC Films': 195,
 'Gaumont': 196,
 'Delphi IV Productions': 197,
 'Capella International': 198,
 'Smart Egg Pictures': 199,
 'Brillstein-Grey Entertainment': 200,
 'Trilogy Entertainment Group': 201,
 'RadicalMedia': 202,
 'Art Linson Productions': 203,
 'Westdeutscher Rundfunk (WDR)': 204,
 'Nelson Entertainment': 205,
 'New Deal Productions': 206,
 'The Geffen Company': 207,
 'Amercent Films': 208,
 'Bob Yari Productions': 209,
 'X-Filme Creative Pool': 210,
 'Zoetrope Studios': 211,
 'Film Council': 212,
 'Gaylord Films': 213,
 'Touchwood Pacific Partners 1': 214,
 'Bona Fide Productions': 215,
 'National Lampoon': 216,
 'SLM Production Group': 217,
 'Permut Presentations': 218,
 'Intrepid Pictures': 219,
 'Silver Screen Partners II': 220,
 'Pantelion Films': 221,
 'Pathé Pictures International': 222,
 'Turman-Foster Company': 223,
 'Screen Australia': 224,
 'Film4': 225,
 'Sandollar Productions': 226,
 'The Malpaso Company': 227,
 'Ghoulardi Film Company': 228,
 'Vinod Chopra Productions': 229,
 'Waypoint Entertainment': 230,
 'Bold Films': 231,
 'Samuel Goldwyn Films': 232,
 'Rastar Pictures': 233,
 'UK Film Council': 234,
 'Gramercy Pictures (I)': 235,
 'American Zoetrope': 236,
 'FilmEngine': 237,
 'Wild Bunch': 238,
 'MWM Studios': 239,
 'Morgan Creek Entertainment': 240,
 'The Ladd Company': 241,
 'Lions Gate Films': 242,
 'Indian Paintbrush': 243,
 'Affirm Films': 244,
 'Dino De Laurentiis Company': 245,
 'Robert Evans Company': 246,
 'Cheyenne Enterprises': 247,
 'Caravan Pictures': 248,
 'Cecchi Gori Group Tiger Cinematografica': 249,
 'Melvin Simon Productions': 250,
 'Anonymous Content': 251,
 'BRON Studios': 252,
 'Imagenation Abu Dhabi FZ': 253,
 'Disneynature': 254,
 'IndieProd Company Productions': 255,
 'Winkler Films': 256,
 'Exclusive Media Group': 257,
 "Cinema '84": 258,
 'Mr. Mudd': 259,
 'Broadway Video': 260,
 'DNA Films': 261,
 'Rogue Pictures': 262,
 'Hyde Park Entertainment': 263,
 'EMI Films': 264,
 'Ruby Films': 265,
 'Fox Searchlight Pictures': 266,
 'Canal+': 267,
 'Clinica Estetico': 268,
 'HBO Films': 269,
 'Rastar Films': 270,
 'Outlaw Productions (I)': 271,
 'Mad Chance': 272,
 'Gran Via Productions': 273,
 'Section Eight': 274,
 'FilmNation Entertainment': 275,
 'Open Road Films (II)': 276,
 'Hollywood Pictures': 277,
 'Motion Picture Corporation of America (MPCA)': 278,
 'Endgame Entertainment': 279,
 'Voltage Pictures': 280,
 'Epsilon Motion Pictures': 281,
 'Wildwood Enterprises': 282,
 'Les Productions Artistes Associés': 283,
 'State Street Pictures': 284,
 'Odyssey Entertainment': 285,
 'Stage 6 Films': 286,
 'Perdido Productions': 287,
 'Mandalay Entertainment': 288,
 'Miramax': 289,
 'Geffen Pictures': 290,
 'Interscope Communications': 291,
 'Cinema Group Ventures': 292,
 'DiNovi Pictures': 293,
 'American Empirical Pictures': 294,
 'Participant': 295,
 'Pathé': 296,
 'Alcor Films': 297,
 'Davis-Films': 298,
 'Dark Castle Entertainment': 299,
 'Tyler Perry Studios': 300,
 'Gravier Productions': 301,
 'David Foster Productions': 302,
 'Estudios Churubusco Azteca S.A.': 303,
 'Black Bear Pictures': 304,
 'MTV Films': 305,
 'Castle Rock Entertainment': 306,
 'Beacon Communications': 307,
 'Sprockets Music': 308,
 'Barwood Films': 309,
 'Gran Via': 310,
 'Tapestry Films': 311,
 'Polygram Filmed Entertainment': 312,
 'CBS Films': 313,
 'Lakeshore Entertainment': 314,
 'Toei Company': 315,
 'The Tyler Perry Company': 316,
 'Spring Creek Productions': 317,
 'Bel Air Entertainment': 318,
 'Paramount Vantage': 319,
 'Aamir Khan Productions': 320,
 'TriStar Pictures': 321,
 'Hanna-Barbera Productions': 322,
 'The Montecito Picture Company': 323,
 'Misher Films': 324,
 'Major Studio Partners': 325,
 'Rainforest Films': 326,
 'Laurence Mark Productions': 327,
 'Baltimore Pictures': 328,
 'Beacon Pictures': 329,
 'Phoenix Pictures': 330,
 'Bazelevs Production': 331,
 'Saturn Films': 332,
 'Screen Gems': 333,
 'St. Petersburg Clearwater Film Commission': 334,
 'Intermedia Films': 335,
 'Ixtlan': 336,
 'Walt Disney Productions': 337,
 'Lawrence Bender Productions': 338,
 'Robert Simonds Productions': 339,
 'United Artists': 340,
 'Focus Features': 341,
 'Dimension Films': 342,
 'River Road Entertainment': 343,
 'Pariah': 344,
 'Cube Vision': 345,
 'PolyGram Filmed Entertainment': 346,
 'Jersey Films': 347,
 'Emmett/Furla/Oasis Films (EFO Films)': 348,
 'Double Feature Films': 349,
 'Warner Bros. Animation': 350,
 'Disney Television Animation': 351,
 'Entertainment One': 352,
 'Carolco Pictures': 353,
 'Star Partners II Ltd.': 354,
 'Lawrence Gordon Productions': 355,
 'Zucker Brothers Productions': 356,
 '3 Arts Entertainment': 357,
 'Tollin/Robbins Productions': 358,
 'Playtone': 359,
 '4 Kids Entertainment': 360,
 'Delphi Films': 361,
 'Face Productions': 362,
 'Imagine Films Entertainment': 363,
 'Newmarket Capital Group': 364,
 'Shangri-La Entertainment': 365,
 'Conundrum Entertainment': 366,
 'Golden Harvest Company': 367,
 'Cinergi Pictures Entertainment': 368,
 'TF1 Films Production': 369,
 'Amen Ra Films': 370,
 'Home Box Office (HBO)': 371,
 'Mandate Pictures': 372,
 'Icon Entertainment International': 373,
 'The Guber-Peters Company': 374,
 'CJ Entertainment': 375,
 'Strike Entertainment': 376,
 'Mirage Enterprises': 377,
 'Tribeca Productions': 378,
 'Plan B Entertainment': 379,
 'Silver Screen Partners IV': 380,
 'Silver Screen Partners III': 381,
 'The Weinstein Company': 382,
 'Mandalay Pictures': 383,
 'Mayhem Pictures': 384,
 'FilmColony': 385,
 'Alcon Entertainment': 386,
 'Allied Filmmakers': 387,
 'STX Entertainment': 388,
 'StudioCanal': 389,
 'Huayi Brothers Media': 390,
 'Will Packer Productions': 391,
 'Annapurna Pictures': 392,
 'China Film Co-Production Corporation': 393,
 'Touchstone Pictures': 394,
 'Working Title Films': 395,
 'DENTSU Music And Entertainment': 396,
 'Dune Entertainment III': 397,
 'FilmDistrict': 398,
 'Gerber Pictures': 399,
 'Film Workshop': 400,
 'Warner Bros. Family Entertainment': 401,
 'The Mark Gordon Company': 402,
 'Metro-Goldwyn-Mayer (MGM)': 403,
 'Dark Horse Entertainment': 404,
 'Global Entertainment Productions GmbH & Company Medien KG': 405,
 'Malpaso Productions': 406,
 'Pacific Western': 407,
 'Millennium Films': 408,
 'Goldcrest Pictures': 409,
 'Craven-Maddalena Films': 410,
 'Avnet/Kerner Productions': 411,
 'Majestic Films International': 412,
 'Revolution Studios': 413,
 'Contrafilm': 414,
 "Mel's Cite du Cinema": 415,
 'Initial Entertainment Group (IEG)': 416,
 'Constantin Film': 417,
 'Brad Grey Pictures': 418,
 'Cold Spring Pictures': 419,
 'Karz Entertainment': 420,
 'Icon Productions': 421,
 'Mutual Film Company': 422,
 'Anton': 423,
 'EuropaCorp': 424,
 'Gold Circle Films': 425,
 'Avery Pix': 426,
 'Good Universe': 427,
 'Scott Rudin Productions': 428,
 'Smokehouse Pictures': 429,
 'Ghost House Pictures': 430,
 'Laika Entertainment': 431,
 'Hughes Entertainment': 432,
 'Blumhouse Productions': 433,
 'Appian Way': 434,
 'Canadian Film or Video Production Tax Credit (CPTC)': 435,
 'Escape Artists': 436,
 'Broken Road Productions': 437,
 'Marc Platt Productions': 438,
 'Point Grey Pictures': 439,
 'Tall Trees Productions': 440,
 'Disneytoon Studios': 441,
 'Chartoff-Winkler Productions': 442,
 'BenderSpink': 443,
 'Bluegrass Films': 444,
 'Red Hour Films': 445,
 'Black Label Media': 446,
 'Cross Creek Pictures': 447,
 'Mace Neufeld Productions': 448,
 'Davis Entertainment': 449,
 'Juno Pix': 450,
 'Radar Pictures': 451,
 'Lionsgate': 452,
 'New Line Cinema': 453,
 'Apatow Productions': 454,
 'New Regency Productions': 455,
 'A Band Apart': 456,
 'Gordon Company': 457,
 'Konrad Pictures': 458,
 'Alphaville Films': 459,
 'Twisted Pictures': 460,
 'QED International': 461,
 'De Line Pictures': 462,
 'Kennedy Miller Productions': 463,
 '21 Laps Entertainment': 464,
 'NPV Entertainment': 465,
 'Imagine Entertainment': 466,
 'Fox 2000 Pictures': 467,
 'Offspring Entertainment': 468,
 'Northern Lights Entertainment': 469,
 'Protozoa Pictures': 470,
 'Donner/Shuler-Donner Productions': 471,
 'Reliance Entertainment': 472,
 'Gary Sanchez Productions': 473,
 'Scott Free Productions': 474,
 'Flower Films (II)': 475,
 'Paramount Pictures': 476,
 'TIK Films': 477,
 'Thunder Road Pictures': 478,
 'Spyglass Entertainment': 479,
 'Columbia Pictures': 480,
 'Studio Ghibli': 481,
 'Silver Pictures': 482,
 'Happy Madison Productions': 483,
 'Universal Pictures': 484,
 'The Zanuck Company': 485,
 'Forward Pass': 486,
 'Mosaic': 487,
 'Gracie Films': 488,
 'Nickelodeon Movies': 489,
 'Roger Birnbaum Productions': 490,
 'Summit Entertainment': 491,
 'Relativity Media': 492,
 'Gunn Films': 493,
 'Walden Media': 494,
 'Twentieth Century Fox': 495,
 'Warner Bros.': 496,
 'Eddie Murphy Productions': 497,
 'Red Wagon Entertainment': 498,
 'Red Granite Pictures': 499,
 'Nippon Television Network (NTV)': 500,
 'Jerry Weintraub Productions': 501,
 'Platinum Dunes': 502,
 'Marv Films': 503,
 'Brandywine Productions': 504,
 'Troublemaker Studios': 505,
 'Ingenious Film Partners': 506,
 'Aardman Animations': 507,
 'Virtual Studios': 508,
 'Solana Films': 509,
 'Tig Productions': 510,
 'Mandeville Films': 511,
 'Sony Pictures Entertainment (SPE)': 512,
 'RatPac Entertainment': 513,
 'Valhalla Motion Pictures': 514,
 'Dreamworks Pictures': 515,
 'Zanuck/Brown Productions': 516,
 'Tim Burton Productions': 517,
 'Perfect World Pictures': 518,
 'Zide-Perry Productions': 519,
 'The Bedford Falls Company': 520,
 "Donners' Company": 521,
 'Team Todd': 522,
 'ImageMovers': 523,
 'Kopelson Entertainment': 524,
 'Original Film': 525,
 'Overbrook Entertainment': 526,
 'Parkes/MacDonald Image Nation': 527,
 'M6 Films': 528,
 'Ingenious Media': 529,
 'LStar Capital': 530,
 'Hurwitz Creative': 531,
 'Eon Productions': 532,
 'Prime Focus': 533,
 'Digital Image Associates': 534,
 'Michael De Luca Productions': 535,
 'Vertigo Entertainment': 536,
 'Village Roadshow Pictures': 537,
 'Cruise/Wagner Productions': 538,
 'Di Bonaventura Pictures': 539,
 'GK Films': 540,
 'DMG Entertainment': 541,
 'TSG Entertainment': 542,
 'Don Simpson/Jerry Bruckheimer Films': 543,
 'Grive Productions': 544,
 'RatPac-Dune Entertainment': 545,
 'Dentsu': 546,
 'Chernin Entertainment': 547,
 'The Kennedy/Marshall Company': 548,
 'Media Rights Capital (MRC)': 549,
 'Amblin Entertainment': 550,
 'Blinding Edge Pictures': 551,
 'Temple Hill Entertainment': 552,
 'Sony Pictures Animation': 553,
 'Dune Entertainment': 554,
 'Bona Film Group': 555,
 'Atlas Entertainment': 556,
 'Green Hat Films': 557,
 'Studio Babelsberg': 558,
 'Walt Disney Animation Studios': 559,
 'The Safran Company': 560,
 'DreamWorks': 561,
 '1492 Pictures': 562,
 'Skydance Media': 563,
 'Sunswept Entertainment': 564,
 'Maverick Films': 565,
 'Color Force': 566,
 'Walt Disney Pictures': 567,
 'Alibaba Pictures': 568,
 'Jerry Bruckheimer Films': 569,
 'FortyFour Studios': 570,
 'Legendary Entertainment': 571,
 'Roth Films': 572,
 'Animal Logic': 573,
 'Twentieth Century Fox Animation': 574,
 'The Saul Zaentz Company': 575,
 'Marvel Enterprises': 576,
 'Centropolis Entertainment': 577,
 'DreamWorks Animation': 578,
 'DC Entertainment': 579,
 'Bad Robot': 580,
 'China Film Group Corporation (CFGC)': 581,
 'Blue Sky Studios': 582,
 'Hasbro': 583,
 'Danjaq': 584,
 'Lucasfilm': 585,
 'Pacific Data Images (PDI)': 586,
 'Marvel Entertainment': 587,
 'WingNut Films': 588,
 'Laura Ziskin Productions': 589,
 'Heyday Films': 590,
 'Syncopy': 591,
 'Pixar Animation Studios': 592,
 'Illumination Entertainment': 593,
 'Lightstorm Entertainment': 594,
 'Marvel Studios': 595}


import random


def getStudioRank(studios):
    """Return the highest ``studioRank`` value among ``studios``.

    Studios without an entry in ``studioRank`` are ignored. When no studio
    yields a positive rank (including an empty ``studios``), a random integer
    from 1 to 200 (inclusive) is returned instead, so the result is only
    reproducible when the ``random`` module has been seeded.

    Args:
        studios: iterable of studio names.

    Returns:
        The best known rank as an int, or the random fallback.
    """
    # Named best_rank rather than `max` so the builtin is not shadowed.
    best_rank = max(
        (studioRank[name] for name in studios if name in studioRank),
        default=0,
    )
    if best_rank > 0:
        return best_rank
    return random.randint(1, 200)


# Fresh copy of `data` scored with the getStudioRank currently defined, then
# the Pearson correlation against worldwide gross and a scatter plot.
data2 = data.assign(StudioRank=data['Studios'].apply(getStudioRank))
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between StudioRank and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='StudioRank', y='Gross_worldwide', color='blue')

Pearsons correlation between StudioRank and Gross: 0.392

<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>


import random


def getStudioRank(studios):
    """Add up the ``studioRank`` value of every studio in ``studios``.

    A studio without an entry in ``studioRank`` contributes a random integer
    from 1 to 200 (inclusive) instead, so the result is only reproducible
    when the ``random`` module has been seeded.

    Args:
        studios: iterable of studio names.

    Returns:
        The summed rank as an int (0 when ``studios`` is empty).
    """
    return sum(
        studioRank[name] if name in studioRank else random.randint(1, 200)
        for name in studios
    )


# Fresh copy of `data` scored with the getStudioRank currently defined, then
# the Pearson correlation against worldwide gross and a scatter plot.
data2 = data.assign(StudioRank=data['Studios'].apply(getStudioRank))
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between StudioRank and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='StudioRank', y='Gross_worldwide', color='blue')

Pearsons correlation between StudioRank and Gross: 0.473

<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>


# Copy the StudioRank column currently held by data2 into `final`.
# NOTE(review): when the cells run top to bottom this is the summed variant
# of getStudioRank defined just above — confirm that is the intended one.
final['StudioRank'] = data2['StudioRank']


import random


def getStudioRank(studios):
    """Average the ``studioRank`` value over the studios in ``studios``.

    A studio without an entry in ``studioRank`` contributes a random integer
    from 0 to 200 (inclusive) instead, so the result is only reproducible
    when the ``random`` module has been seeded.
    NOTE(review): the other getStudioRank variants in this file draw from
    1..200; confirm the lower bound of 0 here is intentional.

    Args:
        studios: sized collection of studio names.

    Returns:
        The mean rank as a float; 0.0 when ``studios`` is empty.
    """
    rank_sum = sum(
        studioRank[name] if name in studioRank else random.randint(0, 200)
        for name in studios
    )
    # Divide by at least 1 so an empty list yields 0.0 instead of raising.
    return rank_sum / max(len(studios), 1)


# Fresh copy of `data` scored with the getStudioRank currently defined, then
# the Pearson correlation against worldwide gross and a scatter plot.
data2 = data.assign(StudioRank=data['Studios'].apply(getStudioRank))
corr = pearsonr(data2['StudioRank'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between StudioRank and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='StudioRank', y='Gross_worldwide', color='blue')

Pearsons correlation between StudioRank and Gross: 0.433

<AxesSubplot:xlabel='StudioRank', ylabel='Gross_worldwide'>


# Order studios10Larger by mean gross, highest first (in place), and keep
# the names found in the first 100 rows.
studios10Larger.sort_values(by='Mean', ascending=False, inplace=True)
top100Studios = list(studios10Larger['Studios'].iloc[:100])


def getNumTopStudios(studios):
    """Count how many entries of ``studios`` appear in ``top100Studios``.

    Args:
        studios: iterable of studio names.

    Returns:
        The number of studios (repeats included) found in ``top100Studios``.
    """
    return sum(1 for name in studios if name in top100Studios)


# Fresh copy of `data` with the number of top-100 studios per movie, then
# the frequency of each count.
data2 = data.assign(NumTopStudios=data['Studios'].apply(getNumTopStudios))
data2['NumTopStudios'].value_counts()

0    7009
1    1248
2     433
3      62
Name: NumTopStudios, dtype: int64


# Pearson correlation of NumTopStudios against worldwide gross, then a
# scatter plot of the same pair of columns.
corr = pearsonr(data2['NumTopStudios'], data2['Gross_worldwide'])[0]
print(f'Pearsons correlation between NumTopStudios and Gross: {corr:.3f}')
data2.plot(kind='scatter', x='NumTopStudios', y='Gross_worldwide', color='brown')

Pearsons correlation between NumTopStudios and Gross: 0.530

<AxesSubplot:xlabel='NumTopStudios', ylabel='Gross_worldwide'>


# Copy the NumTopStudios column computed on data2 into `final`.
final['NumTopStudios'] = data2['NumTopStudios']


# Order studios10Larger by mean gross, highest first (in place), and keep
# the names found in the first 30 rows. The list has its own name so that
# the 100-entry top100Studios list read by getNumTopStudios is not
# overwritten by this cell.
studios10Larger.sort_values(by='Mean', ascending=False, inplace=True)
top30Studios = list(studios10Larger['Studios'][0:30])


def getHasTopStudio(studios):
    """Flag whether at least one studio in ``studios`` is in ``top30Studios``.

    Args:
        studios: iterable of studio names.

    Returns:
        1 if any studio is found in ``top30Studios``, otherwise 0.
    """
    for name in studios:
        if name in top30Studios:
            return 1
    return 0


# Flag every movie with getHasTopStudio on a fresh copy of the data, then
# report the Pearson coefficient against worldwide gross and draw the scatter
# plot.  (A mid-cell value_counts() call whose result was discarded has been
# dropped; it produced no output.)
data2 = data.copy()
data2['HasTopStudio'] = data2['Studios'].apply(getHasTopStudio)
corr = pearsonr(data2['HasTopStudio'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between HasTopStudio and Gross: %.3f' % corr)
data2.plot.scatter(x='HasTopStudio', y='Gross_worldwide', color='brown')

Pearsons correlation between HasTopStudio and Gross: 0.504

<AxesSubplot:xlabel='HasTopStudio', ylabel='Gross_worldwide'>


# Store the HasTopStudio column currently held in data2 as a feature of `final`.
final['HasTopStudio'] = data2['HasTopStudio']


# Replace each movie's country list with the number of countries it names
# (on a copy, so `data` keeps the lists) and display the distribution.
data2 = data.copy()
data2['Countries'] = data2['Countries'].apply(len)
data2['Countries'].value_counts()

1     5743
2     1873
3      676
4      269
5      114
6       45
7       10
8       10
0        5
9        4
11       2
19       1
Name: Countries, dtype: int64


# Pearson coefficient between the number of countries and worldwide gross, plus scatter plot.
corr = pearsonr(data2['Countries'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between Countries and Gross: %.3f' % corr)
data2.plot.scatter(x='Countries', y='Gross_worldwide', color='brown')

Pearsons correlation between Countries and Gross: 0.087

<AxesSubplot:xlabel='Countries', ylabel='Gross_worldwide'>


# Per-country aggregates of worldwide gross: Total, Count, Mean and Median.
country = parseWithMoneyAndCount(data, 'Countries')
# country=country[country['Count']>20]
# Figure 1: the 20 countries with the highest Count.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,1)
data2 = country.sort_values(by='Count', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Releases", fontsize=20)
plt.title("Movie Releases By Country", fontsize=20)
# Figure 2: the 20 countries with the highest Total gross.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,2)
data2 = country.sort_values(by='Total', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Total', color="salmon")
plt.ylabel("Total Gross", fontsize=20)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel("Countries", fontsize=20)
plt.title("Total Gross By Country", fontsize=20)

# Figure 3: the 20 countries with the highest Mean gross.
fig = plt.figure(figsize=(8, 4))
# plt.subplot(2,1,1)
data2 = country.sort_values(by='Mean', ascending=False)[0:20]
plt.bar(data=data2, x='Countries', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.ylabel("Average Gross", fontsize=20)
plt.title("Average Gross By Country", fontsize=20)
plt.show()
## Used later for QUESTION 8: every distinct country name in the data
list_country = list(country['Countries'])


# Display the per-country table ordered by Mean, highest first (`country` itself is not modified).
country.sort_values(by='Mean', ascending=False)


# Flag movies whose country list contains "United States", then report the
# Pearson coefficient against worldwide gross and draw the scatter plot.
data2 = data.copy()
data2['isUnitedStates'] = data2['Countries'].apply(lambda x: 1 if "United States" in x else 0)
corr = pearsonr(data2['isUnitedStates'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between isUnitedStates and Gross: %.3f' % corr)
data2.plot.scatter(x='isUnitedStates', y='Gross_worldwide', color='brown')

Pearsons correlation between isUnitedStates and Gross: 0.129

<AxesSubplot:xlabel='isUnitedStates', ylabel='Gross_worldwide'>


# Store the isUnitedStates column currently held in data2 as a feature of `final`.
final['isUnitedStates'] = data2['isUnitedStates']


# Keep the countries whose Count is at least 100, order them by Mean (highest
# first) and plot their mean gross.  The former leading
# `country.sort_values(...)` call was dropped: its result was neither assigned
# nor displayed, so it had no effect.
release100Countries = country[country['Count'] >= 100]
release100Countries = release100Countries.sort_values(by='Mean', ascending=False)
release100Countries.plot.bar(x='Countries', y='Mean', color='green')

<AxesSubplot:xlabel='Countries'>


# Re-order the frequent countries from lowest to highest Mean and map each
# country name to its 1-based position in that order.
release100Countries = release100Countries.sort_values(by='Mean', ascending=True).reset_index(drop=True)
countryRank = dict()
for i, row in enumerate(release100Countries['Countries']):
    countryRank[row] = i + 1  ## +1 so ranks start at 1, leaving 0 free for movies outside this ranking
countryRank

{'India': 1,
 'Italy': 2,
 'France': 3,
 'Spain': 4,
 'Germany': 5,
 'United States': 6,
 'Canada': 7,
 'United Kingdom': 8,
 'Mexico': 9,
 'Hong Kong': 10,
 'Japan': 11,
 'Australia': 12,
 'China': 13}


def getCountryRank(countries):
    """Return the highest ``countryRank`` value among a movie's countries.

    Countries that are not keys of the module-level ``countryRank`` dict are
    ignored.  When no country is ranked (including an empty list) the
    function falls back to ``random.randint(1, 10)``, so the value for such
    movies changes between runs unless ``random`` is seeded.
    """
    # Unranked names count as 0, which is below every real rank (ranks start at 1).
    best = max((countryRank.get(name, 0) for name in countries), default=0)
    if best == 0:
        best = random.randint(1, 10)
    return best


# Score every movie with the getCountryRank defined just above (highest rank,
# random fallback), then report the Pearson coefficient against worldwide
# gross and draw the scatter plot.
data2 = data.copy()
data2['CountryRank'] = data2['Countries'].apply(getCountryRank)
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CountryRank', y='Gross_worldwide', color='blue')

Pearsons correlation between CountryRank and Gross: 0.153

<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>


# Store the CountryRank column currently held in data2 as a feature of `final`.
# With cells run top to bottom this is the highest-rank variant; the sum and
# binary variants below are only evaluated, not stored.
final['CountryRank'] = data2['CountryRank']


def getCountryRank(countries):
    """Add up the ``countryRank`` values of a movie's ranked countries.

    Names that are not keys of ``countryRank`` contribute nothing, so a movie
    with no ranked country (or no country at all) scores 0.
    """
    return sum(countryRank[name] for name in countries if name in countryRank)


# Score every movie with the getCountryRank defined just above (sum of ranks),
# then report the Pearson coefficient against worldwide gross and draw the
# scatter plot.
data2 = data.copy()
data2['CountryRank'] = data2['Countries'].apply(getCountryRank)
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CountryRank', y='Gross_worldwide', color='blue')

Pearsons correlation between CountryRank and Gross: 0.161

<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>


def getCountryRank(countries):
    """Return 1 when at least one country is a key of ``countryRank``, else 0.

    An empty list yields 0.
    """
    has_ranked_country = any(name in countryRank for name in countries)
    return 1 if has_ranked_country else 0


# Score every movie with the getCountryRank defined just above (binary flag),
# then report the Pearson coefficient against worldwide gross and draw the
# scatter plot.
data2 = data.copy()
data2['CountryRank'] = data2['Countries'].apply(getCountryRank)
corr = pearsonr(data2['CountryRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between CountryRank and Gross: %.3f' % corr)
data2.plot.scatter(x='CountryRank', y='Gross_worldwide', color='blue')

Pearsons correlation between CountryRank and Gross: 0.034

<AxesSubplot:xlabel='CountryRank', ylabel='Gross_worldwide'>


# Per-language aggregates of worldwide gross; keep the 20 languages with the
# largest Total and chart how many times each one occurs.
# NOTE(review): the rows are selected by 'Total' but the bars show 'Count',
# whereas the country release chart selects by 'Count' — confirm which
# ordering is intended here.
language = parseWithMoneyAndCount(data, 'Languages')
data2 = language.sort_values(by='Total', ascending=False)[0:20]

fig = plt.figure(figsize=(8, 4))
plt.bar(data=data2, x='Languages', height='Count', color="salmon")
plt.ylabel("Releases", fontsize=20)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel("Languages", fontsize=20)
plt.title("Release By Language", fontsize=20)
plt.show()


# Flag movies whose language list contains 'English' and display how many
# movies fall on each side of the flag.
data2 = data.copy()
data2['IsEnglish'] = data2['Languages'].apply(lambda x: 1 if 'English' in x else 0)
data2['IsEnglish'].value_counts()

1    8150
0     602
Name: IsEnglish, dtype: int64


# Pearson coefficient between the English flag and worldwide gross, plus scatter plot.
corr = pearsonr(data2['IsEnglish'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between IsEnglish and Gross: %.3f' % corr)
data2.plot.scatter(x='IsEnglish', y='Gross_worldwide', color='blue')

Pearsons correlation between IsEnnglish and Gross: 0.076

<AxesSubplot:xlabel='IsEnglish', ylabel='Gross_worldwide'>


# Store the IsEnglish column currently held in data2 as a feature of `final`.
final['IsEnglish'] = data2['IsEnglish']


# Count the keywords listed for each movie and display how those counts are
# distributed.
data2 = data.copy()
data2['NumKeywords'] = data2['Keywords'].apply(len)
data2['NumKeywords'].value_counts()

5    8566
1      89
2      34
4      24
3      21
0      18
Name: NumKeywords, dtype: int64


# Pearson coefficient between the keyword count and worldwide gross, plus scatter plot.
corr = pearsonr(data2['NumKeywords'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between NumKeywords and Gross: %.3f' % corr)
data2.plot.scatter(x='NumKeywords', y='Gross_worldwide', color='blue')

Pearsons correlation between NumKeywords and Gross: 0.043

<AxesSubplot:xlabel='NumKeywords', ylabel='Gross_worldwide'>


# Per-keyword aggregates of worldwide gross; chart the 15 keywords with the
# highest Count.
keyword = parseWithMoneyAndCount(data, 'Keywords')
fig = plt.figure(figsize=(8, 4))
data2 = keyword.sort_values(by='Count', ascending=False)[0:15]
plt.bar(data=data2, x='Keywords', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=10)
plt.title("Number of movie releases with certain kind of keywords", fontsize=15)
plt.ylabel("Count", fontsize=15)

Text(0, 0.5, 'Count')


# Display the ten keywords with the highest Mean gross.
keyword.sort_values(by='Mean', ascending=False)[0:10]


# Display the ten keywords with the highest Median gross.
keyword.sort_values(by='Median', ascending=False)[0:10]


# Keep the keywords whose Count exceeds 20 and display them by Mean, highest
# first.  NOTE(review): the `count50` in the variable name does not match the
# threshold of 20 used here.
count50Keywords = keyword[keyword['Count'] > 20]
count50Keywords.sort_values(by='Mean', ascending=False)


# Bar chart of the 30 frequent keywords with the highest Mean gross.
count50Keywords.sort_values(by='Mean', ascending=False)[0:30].plot.bar(x='Keywords', y='Mean', color='green')

<AxesSubplot:xlabel='Keywords'>


# Re-order the frequent keywords from lowest to highest Mean and map each
# keyword to its 1-based position in that order.
count50Keywords = count50Keywords.sort_values(by='Mean', ascending=True).reset_index(drop=True)
keywordRank = dict()
for i, row in enumerate(count50Keywords['Keywords']):
    keywordRank[row] = i + 1  ## +1 so ranks start at 1, leaving 0 free for movies outside this ranking
keywordRank

{'independent film': 1,
 'homosexual': 2,
 'male full frontal nudity': 3,
 'topless female nudity': 4,
 'one word title': 5,
 'scene during opening credits': 6,
 'timeframe 1990s': 7,
 'lust': 8,
 'title directed by female': 9,
 'female frontal nudity': 10,
 'three word title': 11,
 'infidelity': 12,
 'written by director': 13,
 'male frontal nudity': 14,
 'student': 15,
 'african american': 16,
 'bare breasts': 17,
 'lesbian': 18,
 'hood': 19,
 'bar': 20,
 'actor': 21,
 'breasts': 22,
 'two word title': 23,
 'vietnam war veteran': 24,
 'restaurant': 25,
 'secret': 26,
 'gay': 27,
 'watching tv': 28,
 'coming of age': 29,
 'slasher': 30,
 'obsession': 31,
 'widow': 32,
 '1950s': 33,
 'period drama': 34,
 'public nudity': 35,
 'panties': 36,
 'satire': 37,
 'woman on top': 38,
 'jewish': 39,
 'male rear nudity': 40,
 'looking at oneself in a mirror': 41,
 'money': 42,
 'f rated': 43,
 'surrealism': 44,
 'aerial camera shot': 45,
 'lesbian kiss': 46,
 'character name as title': 47,
 'teacher': 48,
 'female full frontal nudity': 49,
 'bully': 50,
 'reporter': 51,
 '1930s': 52,
 'coach': 53,
 'waitress': 54,
 'teenage boy': 55,
 'love': 56,
 'movie flop': 57,
 'fbi federal bureau of investigation': 58,
 'sex': 59,
 'italy': 60,
 '1960s': 61,
 'apartment': 62,
 'marriage': 63,
 'police officer': 64,
 'male nudity': 65,
 'bare chested male': 66,
 'four word title': 67,
 'adultery': 68,
 'girl': 69,
 'widower': 70,
 'vietnam': 71,
 'car accident': 72,
 'friend': 73,
 'doctor': 74,
 'boy': 75,
 'pubic hair': 76,
 'teenage girl': 77,
 'boyfriend girlfriend relationship': 78,
 'husband wife relationship': 79,
 'nudity': 80,
 'baseball': 81,
 'neo noir': 82,
 'rape': 83,
 'black comedy': 84,
 'gang': 85,
 'hospital': 86,
 'white panties': 87,
 'voyeur': 88,
 'character names as title': 89,
 'small town': 90,
 'concert': 91,
 'college': 92,
 'thief': 93,
 'writer': 94,
 'pregnancy': 95,
 '1920s': 96,
 'priest': 97,
 'sex comedy': 98,
 'lingerie': 99,
 'hare krishna': 100,
 'robbery': 101,
 'cult film': 102,
 'suicide': 103,
 'femme fatale': 104,
 'church': 105,
 'actress': 106,
 'singer': 107,
 'london england': 108,
 'road trip': 109,
 'seduction': 110,
 'journalist': 111,
 'bikini': 112,
 '1940s': 113,
 'texas': 114,
 'family relationships': 115,
 'female rear nudity': 116,
 'female nudity': 117,
 'dance': 118,
 'gangster': 119,
 'man wears eyeglasses': 120,
 'sex scene': 121,
 'automobile': 122,
 'brother brother relationship': 123,
 'lawyer': 124,
 'extramarital affair': 125,
 'prison': 126,
 'murder': 127,
 'airport': 128,
 'nun': 129,
 'grief': 130,
 'criminal': 131,
 'serial killer': 132,
 'cancer': 133,
 'psychotronic film': 134,
 'neighbor': 135,
 'columbia tristar': 136,
 'slimehouse': 137,
 'spoof': 138,
 'male objectification': 139,
 'best friend': 140,
 'school': 141,
 'military': 142,
 'based on real person': 143,
 'voyeurism': 144,
 'detective': 145,
 'slapstick comedy': 146,
 'dysfunctional family': 147,
 'love triangle': 148,
 'basketball': 149,
 'private detective': 150,
 'nightclub': 151,
 'farce': 152,
 'flashback': 153,
 'high school': 154,
 'teenager': 155,
 'vietnam war': 156,
 'politics': 157,
 'nightmare': 158,
 'hitman': 159,
 'pantyhose': 160,
 'mafia': 161,
 'jealousy': 162,
 'drugs': 163,
 'blonde': 164,
 'dancing': 165,
 'village': 166,
 'neo screwball comedy': 167,
 'cleavage': 168,
 'scantily clad female': 169,
 'racism': 170,
 'mother daughter relationship': 171,
 'based on true story': 172,
 'psychopath': 173,
 'dream': 174,
 'nazi': 175,
 'united states': 176,
 'new york city': 177,
 '1970s': 178,
 'undercover': 179,
 'prostitute': 180,
 'investigation': 181,
 'mother son relationship': 182,
 'boxing': 183,
 'road movie': 184,
 'farm': 185,
 'musician': 186,
 'competition': 187,
 'blood': 188,
 'train': 189,
 'usa': 190,
 'psychiatrist': 191,
 'los angeles california': 192,
 'horse': 193,
 'new york': 194,
 'stripper': 195,
 '1980s': 196,
 'california': 197,
 'singing in a car': 198,
 'baby': 199,
 'nurse': 200,
 'police': 201,
 'faith': 202,
 'on the road': 203,
 'torture': 204,
 'supernatural horror': 205,
 'bound and gagged': 206,
 'father son relationship': 207,
 'disney': 208,
 'vomiting': 209,
 'female protagonist': 210,
 'kiss': 211,
 'single mother': 212,
 'hostage': 213,
 'england': 214,
 'car': 215,
 'violence': 216,
 'revenge': 217,
 'beach': 218,
 'parody': 219,
 'woman in jeopardy': 220,
 'india': 221,
 'paris france': 222,
 'brother sister relationship': 223,
 'friendship': 224,
 'funeral': 225,
 'strong female character': 226,
 'christmas': 227,
 'on the run': 228,
 'party': 229,
 '2010s': 230,
 'desert': 231,
 'divorce': 232,
 'fish out of water': 233,
 'japan': 234,
 'dog': 235,
 'death': 236,
 'scientist': 237,
 '1990s': 238,
 'father daughter relationship': 239,
 'no opening credits': 240,
 'chase': 241,
 'escape': 242,
 'male protagonist': 243,
 'memory': 244,
 'fight': 245,
 'singing': 246,
 'mexico': 247,
 'soldier': 248,
 'snow': 249,
 'sister sister relationship': 250,
 'ghost': 251,
 'time bomb': 252,
 'wedding': 253,
 'hotel': 254,
 'zombie': 255,
 'france': 256,
 'heist': 257,
 'robot': 258,
 'gun': 259,
 'deception': 260,
 '2000s': 261,
 'cat': 262,
 'based on novel': 263,
 'martial arts': 264,
 'demon': 265,
 'kidnapping': 266,
 'race against time': 267,
 'teen movie': 268,
 'conspiracy': 269,
 'survival': 270,
 'rescue': 271,
 'china': 272,
 'world war two': 273,
 'orphan': 274,
 'assassin': 275,
 'vampire': 276,
 'witch': 277,
 '3 dimensional': 278,
 'island': 279,
 'astronaut': 280,
 'outer space': 281,
 'hero': 282,
 'terrorist': 283,
 'good versus evil': 284,
 'dystopia': 285,
 'supernatural power': 286,
 'remake': 287,
 'second part': 288,
 'monster': 289,
 'spy': 290,
 'africa': 291,
 'surprise ending': 292,
 'battle': 293,
 'post apocalypse': 294,
 'alien': 295,
 'betrayal': 296,
 'villain': 297,
 'jungle': 298,
 'time travel': 299,
 'future': 300,
 'sequel': 301,
 'king': 302,
 'princess': 303,
 'magic': 304,
 'based on comic book': 305,
 'superhero': 306}


import random


def getKeywordsRank(keywords):
    """Return the highest ``keywordRank`` value among a movie's keywords.

    Keywords that are not keys of the module-level ``keywordRank`` dict are
    ignored.  When no keyword is ranked (including an empty list) the
    function falls back to ``random.randint(1, 100)``, so the value for such
    movies changes between runs unless ``random`` is seeded.
    """
    # Unranked keywords count as 0, which is below every real rank (ranks start at 1).
    best = max((keywordRank.get(word, 0) for word in keywords), default=0)
    if best == 0:
        best = random.randint(1, 100)
    return best


# Score every movie with the getKeywordsRank defined just above (highest rank,
# random fallback), then report the Pearson coefficient against worldwide
# gross and draw the scatter plot.
data2 = data.copy()
data2['keywordRank'] = data2['Keywords'].apply(getKeywordsRank)
corr = pearsonr(data2['keywordRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between keywordRank and Gross: %.3f' % corr)
data2.plot.scatter(x='keywordRank', y='Gross_worldwide', color='blue')

Pearsons correlation between keywordRank and Gross: 0.167

<AxesSubplot:xlabel='keywordRank', ylabel='Gross_worldwide'>


import random


def getKeywordsRank(keywords):
    """Return the mean rank of a movie's keywords.

    Every keyword that is a key of the module-level ``keywordRank`` mapping
    contributes its stored rank; any other keyword contributes a value drawn
    from ``random.randint(1, 100)``, so the result is not reproducible
    unless the ``random`` module is seeded.  An empty list yields ``0.0``.
    """
    # An empty list is divided by 1 so the result is 0 instead of an error.
    divisor = len(keywords) or 1
    rank_sum = sum(
        keywordRank[word] if word in keywordRank else random.randint(1, 100)
        for word in keywords
    )
    return rank_sum / divisor


# Score every movie with the getKeywordsRank defined just above (mean rank),
# then report the Pearson coefficient against worldwide gross and draw the
# scatter plot.
data2 = data.copy()
data2['keywordRank'] = data2['Keywords'].apply(getKeywordsRank)
corr = pearsonr(data2['keywordRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between keywordRank and Gross: %.3f' % corr)
data2.plot.scatter(x='keywordRank', y='Gross_worldwide', color='blue')

Pearsons correlation between keywordRank and Gross: 0.162

<AxesSubplot:xlabel='keywordRank', ylabel='Gross_worldwide'>


# Store the keywordRank column currently held in data2 as a feature of `final`.
final['keywordRank'] = data2['keywordRank']


# Per-certificate aggregates of worldwide gross: Total, Count, Mean and Median.
certificate = parseWithMoneyAndCount(data, 'ListOfCertificate')
# Figure 1: certificates ordered by Count.
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Count', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Count', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Count", fontsize=20)
plt.title("Releases By Certificate", fontsize=20)
plt.show()
# Figure 2: certificates ordered by Median gross.
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Median', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Median', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Median Gross", fontsize=20)
plt.title("Gross By Certificate", fontsize=20)
plt.show()
# Figure 3: certificates ordered by Mean gross (same title as figure 2).
fig = plt.figure(figsize=(8, 4))
data2 = certificate.sort_values(by='Mean', ascending=False)
plt.bar(data=data2, x='ListOfCertificate', height='Mean', color="salmon")
plt.xticks(rotation=90, fontsize=15)
plt.ylabel("Mean Gross", fontsize=20)
plt.title("Gross By Certificate", fontsize=20)
plt.show()


# Display the per-certificate aggregate table.
certificate


# Order the certificates from lowest to highest Mean and map each certificate
# to its 1-based position in that order.
certificate = certificate.sort_values(by='Mean', ascending=True).reset_index(drop=True)
cerRank = dict()
for i, row in enumerate(certificate['ListOfCertificate']):
    cerRank[row] = i + 1
cerRank

{'NC-17': 1, 'R': 2, 'G': 3, 'PG': 4, 'PG-13': 5}


def getCerRank(cers):
    """Return the highest ``cerRank`` value among a movie's certificates.

    Certificates that are not keys of the module-level ``cerRank`` dict are
    ignored.  The result is 0 when no certificate is ranked, including when
    ``cers`` is empty.
    """
    # Unranked certificates count as 0, which is below every real rank (ranks start at 1).
    return max((cerRank.get(cer, 0) for cer in cers), default=0)


# Score every movie with getCerRank on a fresh copy of the data, then report
# the Pearson coefficient against worldwide gross and draw the scatter plot.
data2 = data.copy()
data2['cerRank'] = data2['ListOfCertificate'].apply(getCerRank)
corr = pearsonr(data2['cerRank'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between cerRank and Gross: %.3f' % corr)
data2.plot.scatter(x='cerRank', y='Gross_worldwide', color='blue')

Pearsons correlation between cerRank and Gross: 0.199

<AxesSubplot:xlabel='cerRank', ylabel='Gross_worldwide'>


# Store the cerRank column currently held in data2 as a feature of `final`.
final['cerRank'] = data2['cerRank']


# Flag movies whose certificate list contains 'PG-13', then report the Pearson
# coefficient against worldwide gross and draw the scatter plot.  This flag is
# only evaluated here; this cell does not add it to `final`.
data2 = data.copy()
data2['PG-13'] = data2['ListOfCertificate'].apply(lambda x: 1 if 'PG-13' in x else 0)
corr = pearsonr(data2['PG-13'], data2['Gross_worldwide'])[0]  # [0] is the correlation coefficient
print('Pearsons correlation between PG-13 and Gross: %.3f' % corr)
data2.plot.scatter(x='PG-13', y='Gross_worldwide', color='blue')

Pearsons correlation between PG-13 and Gross: 0.173

<AxesSubplot:xlabel='PG-13', ylabel='Gross_worldwide'>


# Write the assembled feature table to CSV, omitting the DataFrame index.
final.to_csv('../dataset/extracted/feature_extracted.csv', index=False)

	Movie_Title	Movie_ID	Budget	Cast	Crew	Studios	Genre	Keywords	Languages	Countries	Release_Data	Runtime	Gross_worldwide	Rating	Rating_Count	ListOfCertificate	Release_Year	Release_Month	Release_Day
0	Star Wars: Episode VII - The Force Awakens	2488496	245000000	['Daisy Ridley', 'John Boyega', 'Oscar Isaac',...	['Lawrence Kasdan', 'Michael Arndt', 'J.J. Abr...	['Lucasfilm', 'Bad Robot', 'Truenorth Producti...	['Action', 'Adventure', 'Sci-Fi']	['reboot', 'sanitation employee', 'remake', 'c...	['English']	['United States']	2015-12-18	138.0	2069521700	7.8	893000	['PG-13']	2015	12	18
1	Frozen II	4520988	150000000	['Kristen Bell', 'Idina Menzel', 'Josh Gad', '...	['Jennifer Lee', 'Hans Christian Andersen', 'C...	['Walt Disney Animation Studios', 'Walt Disney...	['Animation', 'Adventure', 'Comedy', 'Family',...	['autumn', 'anthropomorphic snowman', 'princes...	['English']	['United States']	2019-11-22	103.0	1450026933	6.8	156000	['PG']	2019	11	22
2	The Dark Knight Rises	1345836	250000000	['Christian Bale', 'Tom Hardy', 'Anne Hathaway...	['Jonathan Nolan', 'Christopher Nolan', 'David...	['Warner Bros.', 'Legendary Entertainment', 'D...	['Action', 'Crime', 'Drama']	['dc comics', 'batman character', 'bruce wayne...	['English', 'Arabic']	['United Kingdom', 'United States']	2012-07-27	164.0	1081142612	8.4	1600000	['PG-13']	2012	7	27
3	Beauty and the Beast	2771200	160000000	['Emma Watson', 'Dan Stevens', 'Luke Evans', '...	['Evan Spiliotopoulos', 'Bill Condon', 'Stephe...	['Mandeville Films', 'Walt Disney Pictures']	['Adventure', 'Family', 'Fantasy', 'Musical', ...	['beast', 'fairy tale', 'heroine', "beast's he...	['English']	['United States']	2017-03-17	129.0	1273576220	7.1	293000	['PG']	2017	3	17
4	Finding Dory	2277860	200000000	['Ellen DeGeneres', 'Albert Brooks', "Ed O'Nei...	['Angus MacLane', 'Victoria Strouse', 'Andrew ...	['Pixar Animation Studios', 'Walt Disney Pictu...	['Animation', 'Adventure', 'Comedy', 'Family']	['fish', 'ocean', 'whale', 'octopus driving a ...	['English', 'Indonesian']	['United States']	2016-06-17	97.0	1028570942	7.3	259000	['PG']	2016	6	17

	Movie_ID	Budget	Runtime	Gross_worldwide	Rating	Rating_Count	Release_Year	Release_Month	Release_Day
count	8.752000e+03	8.752000e+03	8752.000000	8.752000e+03	8752.000000	8.752000e+03	8752.000000	8752.000000	8752.000000
mean	1.044857e+06	2.351619e+07	108.012117	7.172680e+07	6.430884	8.347387e+04	2000.010398	6.688643	16.042162
std	1.743793e+06	3.713275e+07	19.559340	1.564077e+08	1.005946	1.641414e+05	13.471522	3.416011	8.602773
min	1.234900e+04	2.200000e+02	45.000000	9.500000e+01	1.400000	0.000000e+00	1921.000000	1.000000	1.000000
25%	1.023130e+05	2.200000e+02	95.000000	4.443069e+06	5.800000	7.100000e+03	1991.000000	4.000000	9.000000
50%	2.504325e+05	1.000000e+07	105.000000	1.821152e+07	6.500000	2.600000e+04	2002.000000	7.000000	16.000000
75%	1.221610e+06	3.000000e+07	117.000000	6.560984e+07	7.200000	8.600000e+04	2011.000000	10.000000	23.000000
max	1.103237e+07	3.560000e+08	357.000000	2.847246e+09	9.300000	2.500000e+06	2021.000000	12.000000	31.000000

	Genre	Total	Count	Mean	Median
0	Action	270712872044	1892	1.430829e+08	50747624.5
1	Adventure	298281440727	1515	1.968854e+08	68514844.0
2	Animation	84311140430	400	2.107779e+08	104469116.5
3	Biography	26964736252	663	4.067079e+07	13448497.0
4	Comedy	222959609180	3232	6.898503e+07	20980459.5
5	Crime	87149222552	1589	5.484533e+07	19870567.0
6	Documentary	2710526736	243	1.115443e+07	2702578.0
7	Drama	224422468047	4454	5.038672e+07	14657538.0
8	Family	137386773092	936	1.467807e+08	51444620.5
9	Fantasy	153116140508	981	1.560817e+08	50693129.0
10	Film-Noir	4890859	11	4.446235e+05	22356.0
11	History	18744144753	391	4.793899e+07	13130349.0
12	Horror	39714174296	707	5.617281e+07	25051865.0
13	Music	19600102040	449	4.365279e+07	11749595.0
14	Musical	30446413474	279	1.091269e+08	22762571.0
15	Mystery	62314981386	880	7.081248e+07	23673201.0
16	News	57917412	3	1.930580e+07	4606199.0
17	Romance	109277844855	2201	4.964918e+07	15164458.0
18	Sci-Fi	152723454914	866	1.763550e+08	51939597.0
19	Sport	16388866644	374	4.382050e+07	17712898.0
20	Thriller	169670494650	2224	7.629069e+07	26492560.5
21	War	20548159884	357	5.755787e+07	15291277.0
22	Western	7393407155	143	5.170215e+07	15164458.0

	Genre	Total	Count	Mean	Median
7	Drama	224422468047	4454	5.038672e+07	14657538.0
4	Comedy	222959609180	3232	6.898503e+07	20980459.5
20	Thriller	169670494650	2224	7.629069e+07	26492560.5
17	Romance	109277844855	2201	4.964918e+07	15164458.0
0	Action	270712872044	1892	1.430829e+08	50747624.5
5	Crime	87149222552	1589	5.484533e+07	19870567.0
1	Adventure	298281440727	1515	1.968854e+08	68514844.0
9	Fantasy	153116140508	981	1.560817e+08	50693129.0
8	Family	137386773092	936	1.467807e+08	51444620.5
15	Mystery	62314981386	880	7.081248e+07	23673201.0
18	Sci-Fi	152723454914	866	1.763550e+08	51939597.0
12	Horror	39714174296	707	5.617281e+07	25051865.0
3	Biography	26964736252	663	4.067079e+07	13448497.0
13	Music	19600102040	449	4.365279e+07	11749595.0
2	Animation	84311140430	400	2.107779e+08	104469116.5
11	History	18744144753	391	4.793899e+07	13130349.0
19	Sport	16388866644	374	4.382050e+07	17712898.0
21	War	20548159884	357	5.755787e+07	15291277.0
14	Musical	30446413474	279	1.091269e+08	22762571.0
6	Documentary	2710526736	243	1.115443e+07	2702578.0
22	Western	7393407155	143	5.170215e+07	15164458.0
10	Film-Noir	4890859	11	4.446235e+05	22356.0
16	News	57917412	3	1.930580e+07	4606199.0

	Cast	Total	Count	Mean	Median
61269	Samuel L. Jackson	18580682439	83	2.238636e+08	62022014.0
58569	Robert De Niro	6722811414	77	8.730924e+07	45491656.0
49987	Morgan Freeman	10397736669	65	1.599652e+08	95943453.0
33993	John Goodman	6019264965	62	9.708492e+07	37207906.0
41569	Liam Neeson	7206028513	61	1.181316e+08	48878502.0
...	...	...	...	...	...
28218	István Znamenák	1240663	1	1.240663e+06	1240663.0
28219	Italia Coppola	636796	1	6.367960e+05	636796.0
28221	Italo Renda	34861529	1	3.486153e+07	34861529.0
28222	Itandehui Gutierrez	120673227	1	1.206732e+08	120673227.0
73025	Þrúður Kristjánsdóttir	727594	1	7.275940e+05	727594.0

Exploratory Data Analysis

Multivalued attributes with gross and count

Brief information about data

Heatmap of Correlation Matrix, Histogram and Scatter Matrix

`Genre` analysis

Release_Day analysis

Release_Month analysis

Budget analysis

Cast Analysis

Crew analysis

Studio Analysis

Production Countries Analysis

Language analysis

Keywords analysis

MPAA Analysis

Export to CSV

	Cast	Total	Count	Mean	Median
62173	Sean Anthony Moran	2847246203	1	2.847246e+09	2.847246e+09
30536	Jason Whyte	2847246203	1	2.847246e+09	2.847246e+09
62306	Sean Patrick Murphy	2847246203	1	2.847246e+09	2.847246e+09
29659	James Patrick Pitt	2847246203	1	2.847246e+09	2.847246e+09
38160	Kelly Kilgour	2847246203	1	2.847246e+09	2.847246e+09
...	...	...	...	...	...
25851	Hammou Abaou	95	1	9.500000e+01	9.500000e+01
31262	Jeff Prewett	95	1	9.500000e+01	9.500000e+01
28953	Jacqueline Harris	95	1	9.500000e+01	9.500000e+01
46652	Matthew R. Anderson	95	1	9.500000e+01	9.500000e+01
34893	Jonas Ball	95	1	9.500000e+01	9.500000e+01

	Cast	Total	Count	Mean	Median
0	Rupert Grint	7786412658	8	9.733016e+08	9.383132e+08
1	Lupita Nyong'o	7278884954	8	9.098606e+08	1.020352e+09
2	Evangeline Lilly	6209206084	7	8.870294e+08	6.226741e+08
3	Chadwick Boseman	7668338102	9	8.520376e+08	1.506809e+08
4	Karen Gillan	8474182724	10	8.474183e+08	7.867049e+08
...	...	...	...	...	...
4569	Deborah Kerr	21481898	6	3.580316e+06	3.325950e+04
4570	Rita Taggart	19991204	6	3.331867e+06	2.243077e+06
4571	Tony Curtis	26234046	8	3.279256e+06	1.610252e+06
4572	Dick Cavett	30866179	10	3.086618e+06	1.324484e+06
4573	Arsinée Khanjian	15305635	6	2.550939e+06	3.003460e+06

	Crew	Total	Count	Mean	Median
10649	Woody Allen	1192064607	43	2.772243e+07	14792779.0
1879	Clint Eastwood	3458362840	38	9.100955e+07	53572298.0
9561	Stephen King	1891234174	36	5.253428e+07	22759009.5
9715	Steven Spielberg	10743871515	34	3.159962e+08	288074136.5
5181	John Hughes	2930471621	33	8.880217e+07	49944325.0
...	...	...	...	...	...
4407	James Leo Herlihy	44801177	1	4.480118e+07	44801177.0
4406	James Lee Burke	5009305	1	5.009305e+06	5009305.0
4405	James Lee Barrett	126737428	1	1.267374e+08	126737428.0
4404	James Lasdun	2048740	1	2.048740e+06	2048740.0
10777	Óskar Jónasson	96262212	1	9.626221e+07	96262212.0

	Crew	Total	Count	Mean	Median
639	Anthony Russo	6844248984	5	1.368850e+09	1.153337e+09
5007	Joe Russo	6844248984	5	1.368850e+09	1.153337e+09
5004	Joe Robert Cole	1347597973	1	1.347598e+09	1.347598e+09
9486	Stan Lee	13024534758	12	1.085378e+09	9.784765e+08
5526	Josh Cooley	1073394593	1	1.073395e+09	1.073395e+09
5198	John Knoll	1056057720	1	1.056058e+09	1.056058e+09
3046	Eric Guillon	1034800131	1	1.034800e+09	1.034800e+09
557	Angus MacLane	1028570942	1	1.028571e+09	1.028571e+09
10351	Victoria Strouse	1028570942	1	1.028571e+09	1.028571e+09
5389	Jon Watts	2012094920	2	1.006047e+09	1.006047e+09
4760	Jennifer Lee	2864210897	3	9.547370e+08	1.281508e+09
8205	Pierre Coffin	3708124783	4	9.270312e+08	1.002783e+09
4229	J.K. Rowling	9255312560	10	9.255313e+08	9.155662e+08
6324	Lori Forte	877244782	1	8.772448e+08	8.772448e+08
10688	Yi Liu	870325439	1	8.703254e+08	8.703254e+08
8264	Qun Dong	870325439	1	8.703254e+08	8.703254e+08
4935	Jing Wu	870325439	1	8.703254e+08	8.703254e+08
4475	Jan Duursema	868390560	1	8.683906e+08	8.683906e+08
9571	Stephen McFeely	8655378624	10	8.655379e+08	5.322244e+08
526	Andy Lanning	863756051	1	8.637561e+08	8.637561e+08

	Studios	Total	Count	Mean	Median
6417	Truenorth Productions	2069521700	1	2.069522e+09	2.069522e+09
3434	Jason Roberts Productions	2048359754	1	2.048360e+09	2.048360e+09
5820	South Pictures	2048359754	1	2.048360e+09	2.048360e+09
1021	British Film Commission	1662899439	1	1.662899e+09	1.662899e+09
6616	Vita-Ray Dutch Productions (III)	1153337496	1	1.153337e+09	1.153337e+09
...	...	...	...	...	...
4827	Paramount Famous Lasky Corporation	746	1	7.460000e+02	7.460000e+02
6659	Walter Wanger Productions	623	1	6.230000e+02	6.230000e+02
994	Break Media	528	1	5.280000e+02	5.280000e+02
1174	Campfire	528	1	5.280000e+02	5.280000e+02
3439	Jaz Films	95	1	9.500000e+01	9.500000e+01

	Studios	Total	Count	Mean	Median
4117	Marvel Studios	23795322645	26	9.152047e+08	813667029.0
3830	Lightstorm Entertainment	6754939278	9	7.505488e+08	378882411.0
3226	Illumination Entertainment	6693236668	10	6.693237e+08	588661184.5
4979	Pixar Animation Studios	14528032320	22	6.603651e+08	601716911.5
6029	Syncopy	3775191007	6	6.291985e+08	597530912.5
...	...	...	...	...	...
6843	Zenith Entertainment	17321794	8	2.165224e+06	1708545.5
4453	National Film Board of Canada (NFB)	14559833	7	2.079976e+06	1601612.0
4693	Ontario Film Development Corporation	8636461	6	1.439410e+06	1364006.0
3677	La Sept Cinéma	10915352	8	1.364419e+06	1229040.0
1593	Concorde Pictures	8463581	7	1.209083e+06	1242995.0

	Countries	Total	Count	Mean	Median
6	Bahamas	616502912	1	6.165029e+08	616502912.0
32	Fiji	429632142	1	4.296321e+08	429632142.0
62	Malta	3027543520	11	2.752312e+08	240697856.0
67	Morocco	3758242989	14	2.684459e+08	151314187.0
72	New Zealand	10467742795	48	2.180780e+08	74271180.0
...	...	...	...	...	...
10	Bhutan	1792370	2	8.961850e+05	896185.0
93	Soviet Union	2311743	3	7.705810e+05	93292.0
35	Georgia	686704	1	6.867040e+05	686704.0
58	Liberia	555533	1	5.555330e+05	555533.0
39	Haiti	352296	1	3.522960e+05	352296.0

	Keywords	Total	Count	Mean	Median
4833	forest protection	2847246203	1	2.847246e+09	2.847246e+09
2911	cosmic	2797501328	1	2.797501e+09	2.797501e+09
7144	love affair	2201647264	1	2.201647e+09	2.201647e+09
10605	sailor's death	2201647264	1	2.201647e+09	2.201647e+09
11980	sunken ship	2201647264	1	2.201647e+09	2.201647e+09
10649	sanitation employee	2069521700	1	2.069522e+09	2.069522e+09
3314	death of recurring character	2048359754	1	2.048360e+09	2.048360e+09
10574	s.h.i.e.l.d.	1518815515	1	1.518816e+09	1.518816e+09
2063	car falling off a cliff	1515341399	1	1.515341e+09	1.515341e+09
3521	disney animated sequel	1450026933	1	1.450027e+09	1.450027e+09

	Keywords	Total	Count	Mean	Median
11995	superhero	32041168607	61	5.252651e+08	370569774.0
1120	based on comic book	10474265400	25	4.189706e+08	230884728.0
7217	magic	10738490415	36	2.982914e+08	111927462.0
9413	princess	6618049857	25	2.647220e+08	55534455.0
6675	king	7831973626	30	2.610658e+08	57671894.0
...	...	...	...	...	...
8615	one word title	1203447244	68	1.769775e+07	6258238.5
12603	topless female nudity	609853613	37	1.648253e+07	3039587.0
7275	male full frontal nudity	434996581	30	1.449989e+07	3005289.5
5839	homosexual	361636313	35	1.033247e+07	5526675.0
6154	independent film	166134909	33	5.034391e+06	1467396.0

	ListOfCertificate	Total	Count	Mean	Median
0	G	42824185750	937	4.570351e+07	6780490.0
1	NC-17	925845011	37	2.502284e+07	7412216.0
2	PG	144055409565	1621	8.886824e+07	23237911.0
3	PG-13	283553468080	2466	1.149852e+08	34714400.0
4	R	170006305715	3867	4.396336e+07	14000000.0

	Crew	Total	Count	Mean	Median
639	Anthony Russo	6844248984	5	1.368850e+09	1.153337e+09
5007	Joe Russo	6844248984	5	1.368850e+09	1.153337e+09
9486	Stan Lee	13024534758	12	1.085378e+09	9.784765e+08
4229	J.K. Rowling	9255312560	10	9.255313e+08	9.155662e+08
9571	Stephen McFeely	8655378624	10	8.655379e+08	5.322244e+08
2494	David Yates	6021591899	7	8.602274e+08	9.344541e+08
1779	Christopher Markus	9400391739	11	8.545811e+08	6.447831e+08
6172	Lee Unkrich	3892224463	5	7.784449e+08	8.078179e+08
8173	Philippa Boyens	6663807561	9	7.404231e+08	9.474951e+08
489	Andrew Stanton	5672435426	8	7.090544e+08	7.308323e+08
6992	Michael Arndt	3359222246	5	6.718444e+08	2.861686e+08
1237	Brian Lynch	3203896208	5	6.407792e+08	5.549875e+08
9637	Steve Kloves	7669781515	12	6.391485e+08	8.384820e+08
1281	Bruce Geller	3173171913	5	6.346344e+08	6.827166e+08
1691	Chris Renaud	3168230230	5	6.336460e+08	5.431140e+08
4687	Jeff Nathanson	4425072143	7	6.321532e+08	3.521143e+08
8784	Robert Wade	4923637935	8	6.154547e+08	6.030417e+08
1680	Chris McKenna	3059989349	5	6.119979e+08	6.226741e+08
3256	Fran Walsh	6696215912	11	6.087469e+08	8.976901e+08
2593	Derek Connolly	3020187001	5	6.040374e+08	5.666528e+08

Exploratory Data Analysis

Multivalued attributes with gross and count

Brief information about data

Heatmap of Correlation Matrix, Histogram and Scatter Matrix

Genre analysis

Release_Day analysis

Release_Month analysis

Budget analysis

Cast Analysis

Crew analysis

Studio Analysis

Production Countries Analysis

Language analysis

Keywords analysis

MPAA Analysis

Export to CSV

`Genre` analysis