TMDB 영화 데이터

 

Kaggle TMDB 데이터 콘텐트 기반 필터링

import pandas as pd
import ast
credit = pd.read_csv('./data/tmdb_5000_credits.csv')
movies = pd.read_csv('./data/tmdb_5000_movies.csv')

print(movies.columns)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
movies.head()
budget genres homepage id keywords original_language original_title overview popularity production_companies production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 237000000 [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {... http://www.avatarmovie.com/ 19995 [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "sp... en Avatar In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ... 150.437577 [{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporatio... [{"iso_3166_1": "US", "name": "United States of America"}, {"iso_3166_1": "GB", "name": "United ... 2009-12-10 2787965087 162.0 [{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "Espa\u00f1ol"}] Released Enter the World of Pandora. Avatar 7.2 11800
1 300000000 [{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}] http://disney.go.com/disneypictures/pirates/ 285 [{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic is... en Pirates of the Caribbean: At World's End Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t... 139.082615 [{"name": "Walt Disney Pictures", "id": 2}, {"name": "Jerry Bruckheimer Films", "id": 130}, {"na... [{"iso_3166_1": "US", "name": "United States of America"}] 2007-05-19 961000000 169.0 [{"iso_639_1": "en", "name": "English"}] Released At the end of the world, the adventure begins. Pirates of the Caribbean: At World's End 6.9 4500
2 245000000 [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 80, "name": "Crime"}] http://www.sonypictures.com/movies/spectre/ 206647 [{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}, {"id": 4289, "name": "secret... en Spectre A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil... 107.376788 [{"name": "Columbia Pictures", "id": 5}, {"name": "Danjaq", "id": 10761}, {"name": "B24", "id": ... [{"iso_3166_1": "GB", "name": "United Kingdom"}, {"iso_3166_1": "US", "name": "United States of ... 2015-10-26 880674609 148.0 [{"iso_639_1": "fr", "name": "Fran\u00e7ais"}, {"iso_639_1": "en", "name": "English"}, {"iso_639... Released A Plan No One Escapes Spectre 6.3 4466
3 250000000 [{"id": 28, "name": "Action"}, {"id": 80, "name": "Crime"}, {"id": 18, "name": "Drama"}, {"id": ... http://www.thedarkknightrises.com/ 49026 [{"id": 849, "name": "dc comics"}, {"id": 853, "name": "crime fighter"}, {"id": 949, "name": "te... en The Dark Knight Rises Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c... 112.312950 [{"name": "Legendary Pictures", "id": 923}, {"name": "Warner Bros.", "id": 6194}, {"name": "DC E... [{"iso_3166_1": "US", "name": "United States of America"}] 2012-07-16 1084939099 165.0 [{"iso_639_1": "en", "name": "English"}] Released The Legend Ends The Dark Knight Rises 7.6 9106
4 260000000 [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 878, "name": "Science Fic... http://movies.disney.com/john-carter 49529 [{"id": 818, "name": "based on novel"}, {"id": 839, "name": "mars"}, {"id": 1456, "name": "medal... en John Carter John Carter is a war-weary, former military captain who's inexplicably transported to the myster... 43.926995 [{"name": "Walt Disney Pictures", "id": 2}] [{"iso_3166_1": "US", "name": "United States of America"}] 2012-03-07 284139100 132.0 [{"iso_639_1": "en", "name": "English"}] Released Lost in our world, found in another. John Carter 6.1 2124
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']]
movies_df.head()
id title genres vote_average vote_count popularity keywords overview
0 19995 Avatar [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {... 7.2 11800 150.437577 [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "sp... In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...
1 285 Pirates of the Caribbean: At World's End [{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}] 6.9 4500 139.082615 [{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic is... Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t...
2 206647 Spectre [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 80, "name": "Crime"}] 6.3 4466 107.376788 [{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}, {"id": 4289, "name": "secret... A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...
3 49026 The Dark Knight Rises [{"id": 28, "name": "Action"}, {"id": 80, "name": "Crime"}, {"id": 18, "name": "Drama"}, {"id": ... 7.6 9106 112.312950 [{"id": 849, "name": "dc comics"}, {"id": 853, "name": "crime fighter"}, {"id": 949, "name": "te... Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c...
4 49529 John Carter [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 878, "name": "Science Fic... 6.1 2124 43.926995 [{"id": 818, "name": "based on novel"}, {"id": 839, "name": "mars"}, {"id": 1456, "name": "medal... John Carter is a war-weary, former military captain who's inexplicably transported to the myster...

genres, keywords는 string type으로 여러개의 데이터가 들어와 있음을 확인할 수 있으며 이는 ast 라이브러리를 통해 후처리로 변환이 가능하다.

movies_df['genres'] = movies_df['genres'].apply(ast.literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(ast.literal_eval)
movies_df.head()
id title genres vote_average vote_count popularity keywords overview
0 19995 Avatar [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {... 7.2 11800 150.437577 [{'id': 1463, 'name': 'culture clash'}, {'id': 2964, 'name': 'future'}, {'id': 3386, 'name': 'sp... In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...
1 285 Pirates of the Caribbean: At World's End [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 28, 'name': 'Action'}] 6.9 4500 139.082615 [{'id': 270, 'name': 'ocean'}, {'id': 726, 'name': 'drug abuse'}, {'id': 911, 'name': 'exotic is... Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t...
2 206647 Spectre [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 80, 'name': 'Crime'}] 6.3 4466 107.376788 [{'id': 470, 'name': 'spy'}, {'id': 818, 'name': 'based on novel'}, {'id': 4289, 'name': 'secret... A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...
3 49026 The Dark Knight Rises [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': ... 7.6 9106 112.312950 [{'id': 849, 'name': 'dc comics'}, {'id': 853, 'name': 'crime fighter'}, {'id': 949, 'name': 'te... Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's c...
4 49529 John Carter [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 878, 'name': 'Science Fic... 6.1 2124 43.926995 [{'id': 818, 'name': 'based on novel'}, {'id': 839, 'name': 'mars'}, {'id': 1456, 'name': 'medal... John Carter is a war-weary, former military captain who's inexplicably transported to the myster...

이제 genres, keywords에 대해 python 문법을 적용하여 전처리를 진행할 수 있다.

movies_df['genres'] = movies_df['genres'].apply(lambda x: [genre['name'] for genre in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [keyword['name'] for keyword in x])
movies_df[['genres', 'keywords']][:1]
genres keywords
0 [Action, Adventure, Fantasy, Science Fiction] [culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...

apply, lambda 키워드가 같이 나와서 당황스러울 수 있지만 다음과 같이 생각하면 편하다.
pandas 데이터 프레임에 대해 각 속성을 디테일하게(파이썬 문법을 적용하면서) 처리하고 싶다면 apply, lambda를 같이 사용해서 파이썬 문법을 적용해서 처리하면 되는 것이고, 그렇지 않고 전체적인 타입 변환 등과 같이 한번에 처리해줄 수 있는거라면 apply만 적용해주면 된다.

from sklearn.feature_extraction.text import CountVectorizer

# movies_df.loc[movies_df['genres'] == ['Adventure', 'Fantasy', 'Action']]['genres']
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))
count_vect = CountVectorizer(min_df=0, ngram_range=(2, 2))      # min_df는 default로 1인데, 이걸 0으로 줬다는 것은 1번만 나온것은 카운팅 해준다는 것으로 보임
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)
(4803, 254)

CountVectorization은 등장하는 단어들을 토큰화 한 뒤, 각 단어가 현재 데이터에서(현재 영화에서) 몇번 나왔는지를 행렬로 표현한 것임.
자세한 사항은 다음 블로그 글을 참고하자

위 코드를 통해 4803개의 레코드와 276개의 개별 단어 피처로 구성된 피처 행렬이 만들어짐.
이제 코사인 유사도를 통해 각 레코드끼리의 유사도를 나타내는 행렬을 만들어 보자.

from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
genre_sim
array([[1.        , 0.35355339, 0.35355339, ..., 0.        , 0.        ,
        0.        ],
       [0.35355339, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35355339, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
genre_sim_sorted_ind
array([[   0,  813,   46, ..., 3127, 3126, 2401],
       [  98,  129, 2343, ..., 3201, 3202, 2401],
       [1740,    2, 1542, ..., 3194, 3195, 2401],
       ...,
       [4800, 2472, 3641, ..., 3060, 3059,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0]], dtype=int64)

numpy.argsort() 함수에 들어가는 default axis=-1 이다. 따라서 위처럼 2차원 넘파이의 경우 column wise sorting 을 통한 인덱스가 나오게 된다.
또한 numpy[:, ::-1]은 row는 변화를 주지 않고, column만 슬라이싱 형태로 변화를 주겠다는 의미이며 step=-1을 준 것이므로 순서가 거꾸로 가게 된다.

따라서 위 결과는 0번 레코드와 가장 유사한 레코드는 0번, 두번째로 유사한 레코드는 3494번 레코드라는 의미이다.

def find_sim_movie(df, sorted_ind, title_name, top_n=10):

    # 인자로 입력된 movies_df DataFrame에서 'title' 칼럼이 입력된 title_name 값인 DataFrame 추출
    title_movie = df[df['title'] == title_name]

    # title_name을 가진 DataFrame의 index 객체를 ndarray로 반환하고
    # sorted_ind 인자로 입력된 genre_sim_sorted_ind 객체에서 유사도 순으로 top_n개의 index 추출
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, :top_n]

    # 추출된 top_n index 출력. top_n index는 2차원 데이터임
    # dataframe에서 index로 사용하기 위해서 1차원 array로 변경
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

위 함수에서 reshape(-1)을 통해 similear_indexes를 1차원으로 변경해 준 것을 주목하자.
title_movie.index.values를 통해 나온 index는 정수형이 아닌 numpy 타입으로 bracket [ ] 에 둘러쌓여있다.
즉 이 numpy가 위치행렬 인덱스로 들어가서 numpy에 접근하여 슬라이싱 하는 순간, 파이썬은 해당 numpy에 여러개의 값이 들어갈 것을 대비해
2차원 형태로 슬라이싱을 하게 된다. 슬라이싱을 sorted_ind[3337, :top_n] 형태로 했으면 1차원 형태로 슬라이싱 된다.

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]
title vote_average
3594 Spring Breakers 5.0
2731 The Godfather: Part II 8.3
588 Wall Street: Money Never Sleeps 5.8
1243 Mean Streets 7.2
4217 Kids 6.8
281 American Gangster 7.4
1464 Black Water Transit 0.0
3636 Light Sleeper 5.7
1149 American Hustle 6.8
1847 GoodFellas 8.2