import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pd.options.mode.chained_assignment = None


                    # Read the joined movie dataset and discard the column this analysis does not use.
                    dataset_path = "../dataset/"
                    data = pd.read_csv(dataset_path + "data_joined.csv").drop(columns='Filming_Location')
                    # `a` is a second name for the same frame as `data`, so the in-place
                    # index reset below is visible through both names.
                    a = data
                    a.reset_index(inplace=True, drop=True)
                    # Preview the first rows.
                    a.head()


                    # Count the missing values in every column.
                    a.isna().sum()

Movie_Title             0
Movie_ID                0
Budget               2930
Cast                    9
Crew                    4
Studios                49
Genre                1178
Keywords               27
Languages              11
Countries               6
Release_Data            0
Runtime                 7
Gross_worldwide      1035
Rating                  7
Rating_Count            7
ListOfCertificate    1199
dtype: int64


                    # Collect the labels of the rows that have no worldwide gross, then remove those rows.
                    listToDrop = a.loc[a['Gross_worldwide'].isna()].index.tolist()
                    a = a.drop(index=listToDrop)


                    # Fill the columns that need a specific placeholder first; each placeholder
                    # is written in the same textual format as the real values of its column.
                    a = a.fillna({
                        'Budget': "$0",
                        'Runtime': "0",
                        'Rating': 5.0,
                        'Rating_Count': "0K",
                        'ListOfCertificate': "G",
                    })
                    # Every remaining gap becomes an empty string.
                    a = a.fillna("")


                    # Rows were dropped earlier, so renumber the row labels to a contiguous 0..n-1 range
                    # (drop=True discards the old labels instead of keeping them as a column).
                    a = a.reset_index(drop=True)


                    # Print the set of leading characters found in each money column to see
                    # which currency symbols occur.
                    for column in ('Budget', 'Gross_worldwide'):
                        currency = [str(value)[0] for value in a[column].unique()]
                        print(set(currency))

{'€', '$'}
{'$'}


                    def parse_currency(before_parsed, eur_to_usd=1.14):
                        """Convert a formatted money string into a whole number of US dollars.

                        Args:
                            before_parsed: Amount such as ``"$245,000,000"`` or ``"€1,000"``.
                                A value without a ``$`` sign is treated as euros.
                            eur_to_usd: Multiplier applied to euro amounts. Defaults to the
                                1.14 rate that was previously hard-coded.

                        Returns:
                            The amount as an ``int``; euro amounts are converted and rounded
                            to the nearest whole dollar.

                        Raises:
                            ValueError: If the remaining text is not a valid integer.
                        """
                        text = before_parsed.strip()
                        if '$' in text:
                            return int(text.strip('$').replace(',', ""))
                        euros = int(text.strip('€').replace(',', ""))
                        # Round instead of truncating: 10_000_000 * 1.14 evaluates to
                        # 11399999.999999998 in binary floating point, which int() alone
                        # would cut down to 11399999.
                        return int(round(euros * eur_to_usd))


                    # Turn both money columns from formatted text into integer amounts.
                    a['Budget'] = a['Budget'].map(parse_currency)
                    a['Gross_worldwide'] = a['Gross_worldwide'].map(parse_currency)
                    # Preview the converted rows.
                    a.head()


                    def parse_multi_value(field_value):
                        """Split a comma-separated field into a list of its values.

                        Args:
                            field_value: Text such as ``"Action,Adventure,Sci-Fi"``; an empty
                                string stands for "no values".

                        Returns:
                            The individual values with surrounding whitespace removed.
                            Empty pieces (from an empty field, a trailing comma or a doubled
                            comma) are skipped, so one stray comma no longer discards the
                            whole list.
                        """
                        pieces = (piece.strip() for piece in field_value.split(','))
                        return [piece for piece in pieces if piece]


                    # Columns whose cells hold several comma-separated values.
                    cols = [
                        'Cast', 'Genre', 'Studios', 'ListOfCertificate',
                        'Keywords', 'Languages', 'Countries', 'Crew',
                    ]
                    # Replace each cell's text with the list of its values.
                    for col in cols:
                        a[col] = a[col].map(parse_multi_value)


                    """

import ast
cols =['Cast', 'Genre', 'Studios', 'ListOfCertificate','Keywords', 'Languages', 'Countries']
for col in cols:
    a[col]=a[col].apply(ast.literal_eval)
    
"""

"\n\nimport ast\ncols =['Cast', 'Genre', 'Studios', 'ListOfCertificate','Keywords', 'Languages', 'Countries']\nfor col in cols:\n    a[col]=a[col].apply(ast.literal_eval)\n    \n"


                    def get_unique_certificates(dataframe):
                        """Return the set of distinct values found in the ``ListOfCertificate`` column.

                        Each cell of that column is iterated, and every item it yields is
                        collected into a single set.
                        """
                        return set().union(*dataframe['ListOfCertificate'])


                    # Show every distinct certificate label currently present in the data.
                    get_unique_certificates(a)

{'G', 'GP', 'M', 'M/PG', 'NC-17', 'PG', 'PG-13', 'R', 'X'}


                    def update_certificates(certificates):
                        """Map the certificate labels of one record onto a reduced label set.

                        ``M``, ``GP`` and ``M/PG`` become ``PG``; ``X`` becomes ``NC-17``;
                        any other label is kept as it is.

                        Args:
                            certificates: Iterable of certificate labels.

                        Returns:
                            A list of the resulting labels without duplicates (the order is
                            that of a set, i.e. not guaranteed).
                        """
                        replacements = {'M': 'PG', 'GP': 'PG', 'M/PG': 'PG', 'X': 'NC-17'}
                        return list({replacements.get(label, label) for label in certificates})


                    # Rewrite every record's certificate list with the reduced label set,
                    # then list the labels that remain.
                    a['ListOfCertificate'] = a['ListOfCertificate'].map(update_certificates)
                    get_unique_certificates(a)

{'G', 'NC-17', 'PG', 'PG-13', 'R'}


                    def parseWithMoneyAndCount(dataframe, col_name):
                        """Total the worldwide gross and the number of rows per value of a list column.

                        Args:
                            dataframe: Frame with a ``Gross_worldwide`` column and a column
                                ``col_name`` whose cells are iterables of values.
                            col_name: Name of the multi-valued column to aggregate.

                        Returns:
                            A DataFrame indexed by the distinct values of ``col_name`` with
                            columns ``Money`` (summed ``Gross_worldwide`` of the rows that
                            contain the value) and ``Count`` (number of occurrences), sorted
                            by ``Money`` in descending order.
                        """
                        result = []
                        gross = []
                        # Walk both columns in lockstep rather than looking the gross up by
                        # the loop counter: the counter is a position, not an index label,
                        # so the lookup was only right for a 0..n-1 index.
                        for record, money in zip(dataframe[col_name], dataframe['Gross_worldwide']):
                            for value in record:
                                result.append(value)
                                gross.append(money)
                        # One row per (value, movie) pair; each pair counts once.
                        t = pd.DataFrame({col_name: result, 'Money': gross, 'Count': [1] * len(result)})
                        # Collapse duplicates by summing, then order by total gross.
                        return t.groupby(col_name).sum().sort_values('Money', ascending=False)


                    def convertTime(time):
                        """Convert a runtime such as ``"2 hours 18 minutes"`` into total minutes.

                        Args:
                            time: Runtime text made of an optional ``N hour(s)`` part and an
                                optional ``N minute(s)`` part, or a bare integer (the ``"0"``
                                placeholder used for missing runtimes). Non-string values are
                                converted with ``str`` first.

                        Returns:
                            The runtime in minutes as an ``int``.

                        Raises:
                            ValueError: If the text has neither an hour nor a minute part
                                and is not a bare integer.
                        """
                        import re

                        # Remove spaces so "2 hours 18 minutes" becomes "2hours18minutes".
                        text = str(time).replace(" ", "")
                        # Capture the whole number in front of each unit, so multi-digit
                        # hours and the singular "minute" are read correctly.
                        hour_part = re.search(r"(\d+)hours?", text)
                        minute_part = re.search(r"(\d+)minutes?", text)
                        if hour_part is None and minute_part is None:
                            # No unit at all: the value is expected to be a plain number of minutes.
                            return int(text)
                        total = 0
                        if hour_part is not None:
                            total += int(hour_part.group(1)) * 60
                        if minute_part is not None:
                            total += int(minute_part.group(1))
                        return total


                    # Convert every runtime from text into a number of minutes.
                    a['Runtime'] = a['Runtime'].map(convertTime)


                    # Show how many rows share each runtime value.
                    a['Runtime'].value_counts()

100    259
97     258
96     245
101    244
95     236
      ...
64       1
288      1
207      1
60       1
194      1
Name: Runtime, Length: 160, dtype: int64


                    # Parse the release date text (YYYY-MM-DD) into real datetimes.
                    a['Release_Data'] = pd.to_datetime(a['Release_Data'], format='%Y-%m-%d')
                    # Split the date into separate year / month / day columns using the
                    # vectorized datetime accessor instead of a Python lambda per row.
                    a['Release_Year'] = a['Release_Data'].dt.year
                    a['Release_Month'] = a['Release_Data'].dt.month
                    a['Release_Day'] = a['Release_Data'].dt.day


                    def convertRatingCount(rating_count):
                        """Convert an abbreviated vote count such as ``"893K"`` or ``"1.6M"`` to an int.

                        Args:
                            rating_count: Count with an optional ``M`` (millions) or ``K``
                                (thousands) suffix; values without a suffix must be a plain
                                integer. Non-string values are converted with ``str`` first.

                        Returns:
                            The count as an ``int``.

                        Raises:
                            ValueError: If the numeric part cannot be parsed.
                        """
                        rate = str(rating_count).strip()
                        # 'M' is checked before 'K', matching the original precedence.
                        for suffix, factor in (('M', 1000000), ('K', 1000)):
                            if suffix in rate:
                                # Round rather than truncate so binary float error (for example
                                # 1.001 * 1000 == 1000.9999999999999) cannot lose a unit.
                                return int(round(float(rate.replace(suffix, "")) * factor))
                        return int(rate)


                    # Convert the Rating_Count column to integers.
                    a['Rating_Count'] = a['Rating_Count'].map(convertRatingCount)
                    # Convert the Rating column to floats.
                    a['Rating'] = a['Rating'].astype(float)


                    ## Budget: replace the 0 placeholder with the smallest non-zero budget.
                    indexNZero = list(a[a['Budget'] != 0].index)
                    indexZero = list(a[a['Budget'] == 0].index)
                    # Select by label with .loc: the lists above hold index labels, which
                    # .iloc would wrongly interpret as row positions.
                    # NOTE(review): `min` shadows the builtin; the name is kept only in case
                    # other cells read it.
                    min = a.loc[indexNZero, 'Budget'].min()
                    a['Budget'] = a['Budget'].apply(lambda x: min if x == 0 else x)
                    ## Runtime: replace the 0 placeholder with the mean of the non-zero runtimes.
                    indexNZero = list(a[a['Runtime'] != 0].index)
                    indexZero = list(a[a['Runtime'] == 0].index)
                    mean = a.loc[indexNZero, 'Runtime'].mean()
                    a['Runtime'] = a['Runtime'].apply(lambda x: mean if x == 0 else x)


                    # Write the cleaned frame to CSV; index=False leaves the row labels out of the file.
                    a.to_csv("../dataset/processed/cleaned_data.csv", index=False)

	Movie_Title	Movie_ID	Budget	Cast	Crew	Studios	Genre	Keywords	Languages	Countries	Release_Data	Runtime	Gross_worldwide	Rating	Rating_Count	ListOfCertificate
0	Star Wars: Episode VII - The Force Awakens	2488496	$245,000,000	Daisy Ridley,John Boyega,Oscar Isaac,Domhnall ...	Lawrence Kasdan,Michael Arndt,J.J. Abrams	Lucasfilm,Bad Robot,Truenorth Productions	Action,Adventure,Sci-Fi	reboot,sanitation employee,remake,crash landin...	English	United States	2015-12-18	2 hours 18 minutes	$2,069,521,700	7.8	893K	PG-13
1	Frozen II	4520988	$150,000,000	Kristen Bell,Idina Menzel,Josh Gad,Jonathan Gr...	Jennifer Lee,Hans Christian Andersen,Chris Buck	Walt Disney Animation Studios,Walt Disney Pict...	Animation,Adventure,Comedy,Family,Fantasy,Musical	autumn,anthropomorphic snowman,princess,disney...	English	United States	2019-11-22	1 hour 43 minutes	$1,450,026,933	6.8	156K	PG
2	The Dark Knight Rises	1345836	$250,000,000	Christian Bale,Tom Hardy,Anne Hathaway,Gary Ol...	Jonathan Nolan,Christopher Nolan,David S. Goyer	Warner Bros.,Legendary Entertainment,DC Entert...	Action,Crime,Drama	dc comics,batman character,bruce wayne charact...	English,Arabic	United Kingdom,United States	2012-07-27	2 hours 44 minutes	$1,081,142,612	8.4	1.6M	PG-13
3	Beauty and the Beast	2771200	$160,000,000	Emma Watson,Dan Stevens,Luke Evans,Josh Gad,Ke...	Evan Spiliotopoulos,Bill Condon,Stephen Chbosk...	Mandeville Films,Walt Disney Pictures	Adventure,Family,Fantasy,Musical,Romance	beast,fairy tale,heroine,beast's heart,remake ...	English	United States	2017-03-17	2 hours 9 minutes	$1,273,576,220	7.1	293K	PG
4	Finding Dory	2277860	$200,000,000	Ellen DeGeneres,Albert Brooks,Ed O'Neill,Kaitl...	Angus MacLane,Victoria Strouse,Andrew Stanton	Pixar Animation Studios,Walt Disney Pictures	Animation,Adventure,Comedy,Family	fish,ocean,whale,octopus driving a truck,talki...	English,Indonesian	United States	2016-06-17	1 hour 37 minutes	$1,028,570,942	7.3	259K	PG

Attribute	Fill with
`Budget`	$0
`Runtime`	0
`Rating`	5.0
`Rating_Count`	0K
`ListOfCertificate`	G

	Movie_Title	Movie_ID	Budget	Cast	Crew	Studios	Genre	Keywords	Languages	Countries	Release_Data	Runtime	Gross_worldwide	Rating	Rating_Count	ListOfCertificate
0	Star Wars: Episode VII - The Force Awakens	2488496	245000000	Daisy Ridley,John Boyega,Oscar Isaac,Domhnall ...	Lawrence Kasdan,Michael Arndt,J.J. Abrams	Lucasfilm,Bad Robot,Truenorth Productions	Action,Adventure,Sci-Fi	reboot,sanitation employee,remake,crash landin...	English	United States	2015-12-18	2 hours 18 minutes	2069521700	7.8	893K	PG-13
1	Frozen II	4520988	150000000	Kristen Bell,Idina Menzel,Josh Gad,Jonathan Gr...	Jennifer Lee,Hans Christian Andersen,Chris Buck	Walt Disney Animation Studios,Walt Disney Pict...	Animation,Adventure,Comedy,Family,Fantasy,Musical	autumn,anthropomorphic snowman,princess,disney...	English	United States	2019-11-22	1 hour 43 minutes	1450026933	6.8	156K	PG
2	The Dark Knight Rises	1345836	250000000	Christian Bale,Tom Hardy,Anne Hathaway,Gary Ol...	Jonathan Nolan,Christopher Nolan,David S. Goyer	Warner Bros.,Legendary Entertainment,DC Entert...	Action,Crime,Drama	dc comics,batman character,bruce wayne charact...	English,Arabic	United Kingdom,United States	2012-07-27	2 hours 44 minutes	1081142612	8.4	1.6M	PG-13
3	Beauty and the Beast	2771200	160000000	Emma Watson,Dan Stevens,Luke Evans,Josh Gad,Ke...	Evan Spiliotopoulos,Bill Condon,Stephen Chbosk...	Mandeville Films,Walt Disney Pictures	Adventure,Family,Fantasy,Musical,Romance	beast,fairy tale,heroine,beast's heart,remake ...	English	United States	2017-03-17	2 hours 9 minutes	1273576220	7.1	293K	PG
4	Finding Dory	2277860	200000000	Ellen DeGeneres,Albert Brooks,Ed O'Neill,Kaitl...	Angus MacLane,Victoria Strouse,Andrew Stanton	Pixar Animation Studios,Walt Disney Pictures	Animation,Adventure,Comedy,Family	fish,ocean,whale,octopus driving a truck,talki...	English,Indonesian	United States	2016-06-17	1 hour 37 minutes	1028570942	7.3	259K	PG

Attribute	Fill with
`Budget`	Min
`Runtime`	Mean

Data Cleaning and Preprocessing¶

Missing Values¶

Currency Attributes¶

Multivalued Attributes¶

Multivalued attributes with gross and count¶

Chuẩn hóa lại dữ liệu dạng thời gian¶

1. Dữ liệu cột runtime dạng giờ-phút¶

2. Dữ liệu cột release_date dạng ngày tháng năm¶

Xử lý dữ liệu cột Rating và Rating_Count¶

Xử lý trường có giá trị 0¶

Xuất ra file CSV¶