import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the column
# assignments made on derived frames later in this notebook.
pd.options.mode.chained_assignment = None
Load the data and drop unnecessary attributes, e.g., Filming_Location
# Read the joined dataset and discard the column that is not needed.
dataset_path = "../dataset/"
data = pd.read_csv(dataset_path + "data_joined.csv").drop(columns='Filming_Location')
# `a` is the working frame; at this point it is the same object as `data`.
a = data
a.reset_index(drop=True, inplace=True)
a.head()
Movie_Title | Movie_ID | Budget | Cast | Crew | Studios | Genre | Keywords | Languages | Countries | Release_Data | Runtime | Gross_worldwide | Rating | Rating_Count | ListOfCertificate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Star Wars: Episode VII - The Force Awakens | 2488496 | $245,000,000 | Daisy Ridley,John Boyega,Oscar Isaac,Domhnall ... | Lawrence Kasdan,Michael Arndt,J.J. Abrams | Lucasfilm,Bad Robot,Truenorth Productions | Action,Adventure,Sci-Fi | reboot,sanitation employee,remake,crash landin... | English | United States | 2015-12-18 | 2 hours 18 minutes | $2,069,521,700 | 7.8 | 893K | PG-13 |
1 | Frozen II | 4520988 | $150,000,000 | Kristen Bell,Idina Menzel,Josh Gad,Jonathan Gr... | Jennifer Lee,Hans Christian Andersen,Chris Buck | Walt Disney Animation Studios,Walt Disney Pict... | Animation,Adventure,Comedy,Family,Fantasy,Musical | autumn,anthropomorphic snowman,princess,disney... | English | United States | 2019-11-22 | 1 hour 43 minutes | $1,450,026,933 | 6.8 | 156K | PG |
2 | The Dark Knight Rises | 1345836 | $250,000,000 | Christian Bale,Tom Hardy,Anne Hathaway,Gary Ol... | Jonathan Nolan,Christopher Nolan,David S. Goyer | Warner Bros.,Legendary Entertainment,DC Entert... | Action,Crime,Drama | dc comics,batman character,bruce wayne charact... | English,Arabic | United Kingdom,United States | 2012-07-27 | 2 hours 44 minutes | $1,081,142,612 | 8.4 | 1.6M | PG-13 |
3 | Beauty and the Beast | 2771200 | $160,000,000 | Emma Watson,Dan Stevens,Luke Evans,Josh Gad,Ke... | Evan Spiliotopoulos,Bill Condon,Stephen Chbosk... | Mandeville Films,Walt Disney Pictures | Adventure,Family,Fantasy,Musical,Romance | beast,fairy tale,heroine,beast's heart,remake ... | English | United States | 2017-03-17 | 2 hours 9 minutes | $1,273,576,220 | 7.1 | 293K | PG |
4 | Finding Dory | 2277860 | $200,000,000 | Ellen DeGeneres,Albert Brooks,Ed O'Neill,Kaitl... | Angus MacLane,Victoria Strouse,Andrew Stanton | Pixar Animation Studios,Walt Disney Pictures | Animation,Adventure,Comedy,Family | fish,ocean,whale,octopus driving a truck,talki... | English,Indonesian | United States | 2016-06-17 | 1 hour 37 minutes | $1,028,570,942 | 7.3 | 259K | PG |
Check for missing values
# Number of missing values in each column.
a.isna().sum()
Movie_Title 0 Movie_ID 0 Budget 2930 Cast 9 Crew 4 Studios 49 Genre 1178 Keywords 27 Languages 11 Countries 6 Release_Data 0 Runtime 7 Gross_worldwide 1035 Rating 7 Rating_Count 7 ListOfCertificate 1199 dtype: int64
Drop all records missing Gross_worldwide
# Collect the index labels of the rows without a worldwide gross, then drop them.
missing_gross = a['Gross_worldwide'].isna()
listToDrop = a.index[missing_gross.to_numpy()].tolist()
a = a.drop(index=listToDrop)
Fill missing values:
Attribute | Fill with |
---|---|
Budget | $0 |
Runtime | 0 |
Rating | 5.0 |
Rating_Count | 0K |
ListOfCertificate | G |

Other null attributes are filled with an empty string.
# Column-specific placeholders for missing values; the placeholders are still
# strings where the column has not been parsed yet.
fill_defaults = {
    'Budget': "$0",
    'Runtime': "0",
    'Rating': 5.0,
    'Rating_Count': "0K",
    'ListOfCertificate': "G",
}
for column_name, placeholder in fill_defaults.items():
    a[column_name] = a[column_name].fillna(placeholder)
# Every remaining missing value becomes an empty string.
a = a.fillna("")
Reset the index after the rows above were dropped (no further records are dropped here)
# Renumber the rows from 0 and discard the previous index labels.
a = a.reset_index(drop=True)
Currency symbols in `Budget` and `Gross_worldwide`
# Print the set of leading characters (currency symbols) found in each money column.
for money_column in ('Budget', 'Gross_worldwide'):
    currency = [str(value)[0] for value in a[money_column].unique()]
    print(set(currency))
{'€', '$'} {'$'}
Eliminate `$`, `€` and `,` from the money strings and convert them to integers (euro amounts are converted to dollars at a rate of 1.14)
def parse_currency(before_parsed, eur_to_usd=1.14):
    """Convert a money string such as ``"$245,000,000"`` to an integer amount.

    Amounts containing ``$`` are returned as they are. Any other amount is
    treated as euros (``€``) and multiplied by ``eur_to_usd``; the product is
    truncated to an integer.

    Args:
        before_parsed: Money string with an optional ``$`` or ``€`` sign and
            optional ``,`` thousands separators. Surrounding whitespace is
            ignored.
        eur_to_usd: Euro-to-dollar conversion rate. Defaults to ``1.14``, the
            rate that used to be hard-coded.

    Returns:
        The amount as an ``int``.

    Raises:
        ValueError: If the text left after removing the symbols and commas is
            not a whole number.
    """
    text = str(before_parsed).strip()
    digits = text.strip('$€').strip().replace(',', '')
    if '$' in text:
        return int(digits)
    # No dollar sign: the amount is assumed to be in euros.
    return int(int(digits) * eur_to_usd)
# Turn both money columns into integer amounts, then preview the result.
for money_column in ('Budget', 'Gross_worldwide'):
    a[money_column] = a[money_column].map(parse_currency)
a.head()
Movie_Title | Movie_ID | Budget | Cast | Crew | Studios | Genre | Keywords | Languages | Countries | Release_Data | Runtime | Gross_worldwide | Rating | Rating_Count | ListOfCertificate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Star Wars: Episode VII - The Force Awakens | 2488496 | 245000000 | Daisy Ridley,John Boyega,Oscar Isaac,Domhnall ... | Lawrence Kasdan,Michael Arndt,J.J. Abrams | Lucasfilm,Bad Robot,Truenorth Productions | Action,Adventure,Sci-Fi | reboot,sanitation employee,remake,crash landin... | English | United States | 2015-12-18 | 2 hours 18 minutes | 2069521700 | 7.8 | 893K | PG-13 |
1 | Frozen II | 4520988 | 150000000 | Kristen Bell,Idina Menzel,Josh Gad,Jonathan Gr... | Jennifer Lee,Hans Christian Andersen,Chris Buck | Walt Disney Animation Studios,Walt Disney Pict... | Animation,Adventure,Comedy,Family,Fantasy,Musical | autumn,anthropomorphic snowman,princess,disney... | English | United States | 2019-11-22 | 1 hour 43 minutes | 1450026933 | 6.8 | 156K | PG |
2 | The Dark Knight Rises | 1345836 | 250000000 | Christian Bale,Tom Hardy,Anne Hathaway,Gary Ol... | Jonathan Nolan,Christopher Nolan,David S. Goyer | Warner Bros.,Legendary Entertainment,DC Entert... | Action,Crime,Drama | dc comics,batman character,bruce wayne charact... | English,Arabic | United Kingdom,United States | 2012-07-27 | 2 hours 44 minutes | 1081142612 | 8.4 | 1.6M | PG-13 |
3 | Beauty and the Beast | 2771200 | 160000000 | Emma Watson,Dan Stevens,Luke Evans,Josh Gad,Ke... | Evan Spiliotopoulos,Bill Condon,Stephen Chbosk... | Mandeville Films,Walt Disney Pictures | Adventure,Family,Fantasy,Musical,Romance | beast,fairy tale,heroine,beast's heart,remake ... | English | United States | 2017-03-17 | 2 hours 9 minutes | 1273576220 | 7.1 | 293K | PG |
4 | Finding Dory | 2277860 | 200000000 | Ellen DeGeneres,Albert Brooks,Ed O'Neill,Kaitl... | Angus MacLane,Victoria Strouse,Andrew Stanton | Pixar Animation Studios,Walt Disney Pictures | Animation,Adventure,Comedy,Family | fish,ocean,whale,octopus driving a truck,talki... | English,Indonesian | United States | 2016-06-17 | 1 hour 37 minutes | 1028570942 | 7.3 | 259K | PG |
Multivalued data is separated by commas (`,`).
`Cast`, `Genre`, `Studios`, `ListOfCertificate`, `Keywords`, `Languages`, `Countries` and `Crew` are multivalued, so we convert them to the `List` data structure.
def parse_multi_value(field_value):
    """Split a comma-separated field into a list of its non-empty values.

    Empty tokens (from an empty field, a trailing comma or a doubled comma)
    are skipped. Previously a single empty token caused the whole field to be
    returned as ``[]``, silently discarding the valid values next to it.

    Args:
        field_value: String holding zero or more values separated by ``,``.

    Returns:
        List of the non-empty values in their original order; ``[]`` for an
        empty string.
    """
    return [value for value in field_value.split(',') if value]
# Columns whose cells hold several comma-separated values.
cols = ['Cast', 'Genre', 'Studios', 'ListOfCertificate', 'Keywords', 'Languages', 'Countries', 'Crew']
# Replace each such cell by a Python list of its values.
for col in cols:
    a[col] = a[col].map(parse_multi_value)
Note: after these columns are changed to lists, the snippet below is needed to parse them back into lists when the data is loaded again in another part.
"""
import ast
cols =['Cast', 'Genre', 'Studios', 'ListOfCertificate','Keywords', 'Languages', 'Countries']
for col in cols:
a[col]=a[col].apply(ast.literal_eval)
"""
"\n\nimport ast\ncols =['Cast', 'Genre', 'Studios', 'ListOfCertificate','Keywords', 'Languages', 'Countries']\nfor col in cols:\n a[col]=a[col].apply(ast.literal_eval)\n \n"
Process wrong values in ListOfCertificate
def get_unique_certificates(dataframe):
    """Return the set of distinct certificates found in ``ListOfCertificate``.

    Every entry of ``dataframe['ListOfCertificate']`` is iterated and each
    item it yields is collected into a single set.
    """
    return {
        certificate
        for row_certificates in dataframe['ListOfCertificate']
        for certificate in row_certificates
    }
# Display the distinct certificate values currently present in the data.
get_unique_certificates(a)
{'G', 'GP', 'M', 'M/PG', 'NC-17', 'PG', 'PG-13', 'R', 'X'}
Some of those certificates are out of date and do not follow the MPAA's newest policy. They should be replaced:
M, GP and M/PG replaced by PG
X replaced by NC-17
def update_certificates(certificates):
    """Map outdated certificates to their current equivalents and deduplicate.

    ``M``, ``GP`` and ``M/PG`` become ``PG``; ``X`` becomes ``NC-17``; every
    other value is kept as it is.

    Args:
        certificates: Iterable of certificate strings.

    Returns:
        List of the updated certificates without duplicates, in order of first
        appearance.
    """
    replacements = {'M': 'PG', 'GP': 'PG', 'M/PG': 'PG', 'X': 'NC-17'}
    updated = (replacements.get(certificate, certificate) for certificate in certificates)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible; converting a set of strings to a list is not,
    # because its order can change from one interpreter run to the next.
    return list(dict.fromkeys(updated))
# Normalize the certificates of every row, then list the distinct values that remain.
a['ListOfCertificate'] = a['ListOfCertificate'].map(update_certificates)
get_unique_certificates(a)
{'G', 'NC-17', 'PG', 'PG-13', 'R'}
- `dataframe`: DataFrame that needs to be processed
- `col_name`: Name of the column to process, for example `Cast`, `Genre` or `Studios`
def parseWithMoneyAndCount(dataframe, col_name):
    """Total the worldwide gross and the number of rows per value of a list column.

    Each row contributes its ``Gross_worldwide`` once for every value listed
    in ``dataframe[col_name]``.

    Args:
        dataframe: DataFrame with a ``Gross_worldwide`` column and the column
            named by ``col_name``.
        col_name: Name of a column whose cells are iterables of values (for
            example lists of genres).

    Returns:
        DataFrame indexed by the distinct values, with a ``Money`` column (sum
        of gross) and a ``Count`` column (number of occurrences), sorted by
        ``Money`` in descending order.
    """
    result = []
    gross = []
    # Walk both columns in lockstep. The previous version used the enumerate()
    # position as an index *label* on Gross_worldwide, which picks the wrong
    # row or raises KeyError on any frame whose index is not 0..n-1.
    for record, money in zip(dataframe[col_name], dataframe['Gross_worldwide']):
        for x in record:
            result.append(x)
            gross.append(money)
    # One row per (value, movie) pair; Count is 1 so that summing counts rows.
    t = pd.DataFrame({col_name: result, 'Money': gross, 'Count': [1] * len(result)})
    # Collapse duplicate values by summing, then put the highest gross first.
    return t.groupby(col_name).sum().sort_values('Money', ascending=False)
def convertTime(time):
    """Convert a runtime such as ``"2 hours 18 minutes"`` to a number of minutes.

    Accepted forms are any combination of ``<n> hour(s)`` and
    ``<n> minute(s)`` (spaces are optional), or a bare integer, which is taken
    as minutes.

    Args:
        time: Runtime text; it is converted with ``str()`` first.

    Returns:
        Total runtime in minutes as an ``int``.

    Raises:
        ValueError: If the text contains no hour/minute part and is not an
            integer.
    """
    import re

    # Remove whitespace so "2 hours 18 minutes" and "2hours18minutes" look the same.
    text = str(time).replace(" ", "")
    # Read the full number in front of each unit. Taking only the first
    # character as the hour count broke runtimes of ten hours or more, and a
    # bare "1 minute" could not be parsed at all.
    parts = re.findall(r"(\d+)(hours?|minutes?)", text)
    if not parts:
        # No unit present: the value is expected to be a plain minute count.
        return int(text)
    total_minutes = 0
    for amount, unit in parts:
        total_minutes += int(amount) * (60 if unit.startswith("hour") else 1)
    return total_minutes
Chuẩn hóa thời gian cho bộ dữ liệu
# Convert every runtime string to minutes, then inspect the resulting distribution.
a['Runtime'] = a['Runtime'].map(convertTime)
a['Runtime'].value_counts()
100 259 97 258 96 245 101 244 95 236 ... 64 1 288 1 207 1 60 1 194 1 Name: Runtime, Length: 160, dtype: int64
Tách ngày, tháng, năm của từng bộ phim
# Parse the release date, then expose its year, month and day as separate columns.
a['Release_Data'] = pd.to_datetime(a['Release_Data'], format='%Y-%m-%d')
a['Release_Year'] = [released.year for released in a['Release_Data']]
a['Release_Month'] = [released.month for released in a['Release_Data']]
a['Release_Day'] = [released.day for released in a['Release_Data']]
def convertRatingCount(rating_count):
    """Convert an abbreviated rating count such as ``"893K"`` or ``"1.6M"`` to an int.

    A trailing ``K`` multiplies by 1,000 and a trailing ``M`` by 1,000,000
    (either case); a value without suffix must already be an integer.
    Surrounding whitespace is ignored.

    Args:
        rating_count: Count as text or number; it is converted with ``str()``.

    Returns:
        The count as an ``int``.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    multipliers = {'K': 1000, 'M': 1000000}
    text = str(rating_count).strip().upper()
    suffix = text[-1:]
    if suffix in multipliers:
        # round() instead of int(): a binary float product can land just below
        # the intended whole number, and truncation would then lose one vote.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)
# Convert the Rating_Count column to integers
a['Rating_Count'] = a['Rating_Count'].map(convertRatingCount)
# Convert the Rating column to floats
a['Rating'] = a['Rating'].astype('float64')
Fill 0 values:
Attribute | Fill with |
---|---|
Budget | Min |
Runtime | Mean |
## Budget: rows whose budget is 0 receive the smallest non-zero budget.
# The rows are selected with a boolean mask, so the result does not depend on
# the index being 0..n-1 (the earlier iloc call was given index labels), and
# the helper variable no longer shadows the builtin `min`.
min_budget = a.loc[a['Budget'] != 0, 'Budget'].min()
a['Budget'] = a['Budget'].apply(lambda x: min_budget if x == 0 else x)

## Runtime: rows whose runtime is 0 receive the mean of the non-zero runtimes.
mean_runtime = a.loc[a['Runtime'] != 0, 'Runtime'].mean()
a['Runtime'] = a['Runtime'].apply(lambda x: mean_runtime if x == 0 else x)

# Write the cleaned dataset to disk without the index column.
a.to_csv("../dataset/processed/cleaned_data.csv", index=False)