# Importing Libraries
import requests
import pandas as pd
import numpy as np
 
# Load the sales data and keep only rows dated 2015 or earlier
# (filtering out incomplete year); rows whose Year is missing fail the
# comparison and are therefore removed as well
all_time = pd.read_csv("vgsales.csv")
within_complete_years = all_time['Year'] <= 2015
all_time = all_time[within_complete_years]


# Largest per-column non-null count, before and after discarding every row
# that contains a missing value
total_rows = all_time.count().max()
complete_rows = all_time.dropna().count().max()
missing_rows = total_rows - complete_rows
p = (missing_rows / total_rows) * 100

print(f"Total Observations: {total_rows}")
print(f"Total Missing Observations: {missing_rows}")
print(f"Percentage of missing Observations: {p}")

Total Observations: 15979
Total Missing Observations: 34
Percentage of missing Observations: 0.21277927279554415


# Discard every row that has at least one missing value, then preview the result
all_time = all_time.dropna(axis=0, how='any')
all_time.head()


# Summing all sales for each year.
# numeric_only=True restricts the sum to numeric columns; without it,
# pandas >= 2.0 also "sums" the text columns (Name, Platform, Genre,
# Publisher) by concatenating their strings.
all_time.groupby('Year').sum(numeric_only=True).head()


# Importing matplotlib and plotting data
from matplotlib import pyplot as plt

# Total sales per year; numeric_only=True keeps the text columns out of the
# aggregation (pandas >= 2.0 would otherwise concatenate them)
yearly_sales = all_time.dropna().groupby('Year').sum(numeric_only=True).reset_index()
plot = yearly_sales.plot(
    x='Year',
    y='Global_Sales',
    title="Total Video Game Sales Over Time",
    ylabel="Total Sales",
    figsize=(15, 5),
)


# Displaying unique platforms
pd.unique(all_time["Platform"])

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)


relevant = ['PC', 'PS4', 'XOne', 'WiiU', 'X360', 'Wii', 'DS', 'PS2', 'PS3', 'XB', 'GC']
plt.figure(figsize=(15, 5))

# Only plotting relevant platforms, restricted to years after 2000
for platform in relevant:
    platform_rows = all_time[all_time["Platform"] == platform]
    # numeric_only=True keeps the text columns out of the yearly sum
    temp = platform_rows.groupby('Year').sum(numeric_only=True).reset_index()
    temp = temp[temp['Year'] > 2000]
    plt.plot(temp['Year'], temp['Global_Sales'], label=platform)

# One legend call is enough: a second call would simply replace the first legend
plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.xlabel("Year")
plt.ylabel("Sales")
plt.title("Game Sales per Year for Different Platforms")
plt.show()


# Displaying unique genres
pd.unique(all_time["Genre"])

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)


plt.figure(figsize=(15, 5))

# Plotting all genres: one line of yearly global sales per genre
for genre in all_time["Genre"].unique():
    genre_rows = all_time[all_time["Genre"] == genre]
    # numeric_only=True keeps the text columns out of the yearly sum
    temp = genre_rows.groupby('Year').sum(numeric_only=True).reset_index()
    plt.plot(temp['Year'], temp['Global_Sales'], label=genre)

plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.xlabel("Year")
plt.ylabel("Sales")
plt.title("Game Sales per Year for Different Genres")
plt.show()


import os

# URL for request
URL = "https://api.rawg.io/api/games?page_size=20000&dates=2015-01-01,2019-12-31&ordering=-added"

# Setting api key for request.
# NOTE(review): this key is a credential committed to source. The
# RAWG_API_KEY environment variable takes precedence when it is set; the
# literal below is kept only as a backward-compatible fallback and should be
# rotated and removed.
PARAMS = {'key': os.environ.get("RAWG_API_KEY", "b084e7f4218f49f291800dfcca6a229c")}
r = requests.get(url=URL, params=PARAMS)


# Columns kept for every game returned by the API
_GAME_COLUMNS = ['Title', 'Released', 'Platforms', 'Downloads', 'Rating', 'Tags', 'Genres']


def _game_to_row(game):
    """Reduce one API game record to the columns used in this analysis.

    List-valued fields that the API reports as None are kept as None, so the
    row can later be removed with dropna(). Only tags whose language is
    'eng' are kept. Raises KeyError when an expected key is absent.
    """
    platforms = game['platforms']
    tags = game['tags']
    genres = game['genres']
    return {
        'Title': game['name'],
        'Released': game['released'],
        'Platforms': None if platforms is None
        else [entry['platform']['slug'] for entry in platforms],
        'Downloads': game['added'],
        'Rating': game['rating'],
        'Tags': None if tags is None
        else [tag['slug'] for tag in tags if tag['language'] == 'eng'],
        'Genres': None if genres is None else [genre['slug'] for genre in genres],
    }


# Converting the first response to JSON; rows are collected in a plain list
# and turned into a dataframe once, instead of concatenating per game
rows = []
data = r.json()

try:
    # Keep following the 'next' link; a response without the expected keys
    # (KeyError) also ends the download
    while True:
        # Gathering only relevant data
        for game in data['results']:
            rows.append(_game_to_row(game))
        # Checked AFTER the page is processed so the final page, whose
        # 'next' link is empty, is not silently dropped
        if not data['next']:
            break
        data = requests.get(url=data['next']).json()
except KeyError:
    print("Done")

# Newest-fetched games first: this matches the row order produced by
# prepending each game to the dataframe
df = pd.DataFrame(rows[::-1], columns=_GAME_COLUMNS)

df.tail()

Done


# Largest per-column non-null count, before and after discarding every row
# that contains a missing value
observed_rows = df.count().max()
complete_rows = df.dropna().count().max()
missing_rows = observed_rows - complete_rows

print(f"Total Observations: {observed_rows}")
print(f"Total Missing Observations: {missing_rows}")
print(f"Percentage of missing Observations: {(missing_rows / observed_rows) * 100}")

Total Observations: 10000
Total Missing Observations: 1
Percentage of missing Observations: 0.01


# Remove rows with missing values, then shuffle the row order
# (frac=1 samples every remaining row; no random_state is given, so the
# order differs from run to run)
df = df.dropna()
df = df.sample(frac=1)

def get_unique(s):
    """Return the sorted unique elements found across all values of `s`.

    `s` must provide `.items()` yielding (key, value) pairs, e.g. a pandas
    Series; each value is itself an iterable (such as a list of slugs).
    All values are flattened into one list before deduplication.
    """
    flattened = [element for _, values in s.items() for element in values]
    return np.unique(np.array(flattened))


# Distinct values of each list-valued column, computed in one pass over the
# column names and then printed
unique_platforms, unique_tags, unique_genres = (
    get_unique(df[column]) for column in ('Platforms', 'Tags', 'Genres')
)
print(f"Unique Platforms:\n {unique_platforms}\n")
print(f"Unique Tags:\n {unique_tags}\n")
print(f"Unique Genres:\n {unique_genres}\n")

Unique Platforms:
 ['android' 'gamecube' 'genesis' 'ios' 'linux' 'macintosh' 'macos' 'nes'
 'nintendo-3ds' 'nintendo-ds' 'nintendo-switch' 'pc' 'playstation1'
 'playstation2' 'playstation3' 'playstation4' 'playstation5' 'ps-vita'
 'psp' 'sega-master-system' 'web' 'wii' 'wii-u' 'xbox-old' 'xbox-one'
 'xbox-series-x' 'xbox360']

Unique Tags:
 ['1-bit' '16-bit' '1960s' ... 'zelda-like' 'zelda-style' 'zombies']

Unique Genres:
 ['action' 'adventure' 'arcade' 'board-games' 'card' 'casual' 'educational'
 'family' 'fighting' 'indie' 'massively-multiplayer' 'platformer' 'puzzle'
 'racing' 'role-playing-games-rpg' 'shooter' 'simulation' 'sports'
 'strategy']


# One (initially empty) feature row per game
X = [[] for _ in range(len(df))]

# Looping through each column of interest; the features of 'Platforms' come
# first in every row, followed by those of 'Genres'
for column in ['Platforms', 'Genres']:
    # Plain Python strings, in the sorted order returned by get_unique
    unique_set = get_unique(df[column]).tolist()
    for row, elem in zip(X, df[column]):
        # A set gives constant-time membership tests instead of one
        # np.isin call per (game, category) pair
        present = set(elem)
        # Adding a one if the category is present for this game, otherwise zero
        row.extend(1 if unique in present else 0 for unique in unique_set)


# Work on a copy and replace the numeric rating with two discrete classes.
# NOTE(review): two labels only fit if exactly two quantile bins remain after
# duplicates='drop' — confirm whenever the underlying data changes.
disc = df.copy()
rating_classes = ['Disliked', 'Liked']
disc['Rating'] = pd.qcut(disc['Rating'], q=4, labels=rating_classes, duplicates='drop')


disc.head()


from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score

# Gathering data: labels are the discretised ratings, features the one-hot rows
y = disc.copy()['Rating'].to_numpy()
print(y[1])
X = np.array(X)

# Splitting data; the fold count is named once so the average below cannot
# drift out of sync with the splitter
n_splits = 10
kf = KFold(n_splits=n_splits)
avg_acc = 0

# Testing and training each fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Creating and training model
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    # Scoring the fold once, then printing and accumulating that same value
    fold_acc = accuracy_score(y_test, clf.predict(X_test))
    print(fold_acc, end=" | ")
    avg_acc += fold_acc

# Printing average accuracy
print(f"\nAverage Accuracy: {100*(avg_acc/n_splits)}%")

Disliked
0.792 | 0.797 | 0.79 | 0.802 | 0.773 | 0.786 | 0.804 | 0.791 | 0.785 | 0.7947947947947948 | 
Average Accuracy: 79.14794794794796%


# Original game in form of data
stretch = {'Platforms': ['pc'], 'Genres': ['platformer', 'puzzle', 'indie']}

# AAA version in form of data
new = {'Platforms': ['pc', 'xbox-one', 'playstation4', 'nintendo-switch'], 'Genres': ['platformer', 'puzzle']}

# One-hot encoding of both games: for each column, one 0/1 entry per unique
# value of that column in df, platforms first and genres second
stretch_cont = []
new_cont = []
for column in ['Platforms', 'Genres']:
    for unique in get_unique(df[column]):
        stretch_cont.append(1 if unique in stretch[column] else 0)
        new_cont.append(1 if unique in new[column] else 0)


# Refit the classifier on every observation (fit updates the estimator in
# place and returns it, so no reassignment is needed)
clf.fit(X, y)


# Predict the rating class for both versions of the game in a single call
candidates = [stretch_cont, new_cont]
a = clf.predict(candidates)
print(f"Result of Stretchy-Man (indie): {a[0]}")
print(f"Result of Stretchy-Man (AAA): {a[1]}")

Result of Stretchy-Man (indie): Disliked
Result of Stretchy-Man (AAA): Liked


# Fresh copy of the data with the rating binned into five one-point ranges;
# include_lowest=True makes the first range include its left edge (0)
disc = df.copy()
rating_edges = [0, 1, 2, 3, 4, 5]
rating_ranges = ['0-1', '1-2', '2-3', '3-4', '4-5']
disc['Rating'] = pd.cut(disc['Rating'], bins=rating_edges, labels=rating_ranges,
                        include_lowest=True)

disc.head()


# Displaying bar graph of the average downloads per rating range.
# numeric_only=True skips the text/list columns, on which mean() raises in
# pandas >= 2.0; observed=False keeps every rating range (even an empty one)
# so the five bars always line up with the five colours.
temp = disc.groupby('Rating', observed=False).mean(numeric_only=True).reset_index()
plt.figure(figsize=(10, 5.5))
plt.bar(temp['Rating'], temp['Downloads'], color=['indianred', 'orange', 'yellow', 'greenyellow', 'lightgreen'])
plt.xlabel("Rating Range")
plt.ylabel("Average Downloads")
plt.title("Rating Effect on Average Downloads")
plt.show()


# Importing linear regression model from sklearn
from sklearn.linear_model import LinearRegression

# Ratings reshaped to a single-feature column, downloads as the target;
# fitted on the original (non-discretised) data
rating_feature = df['Rating'].array.reshape(-1, 1)
downloads = df['Downloads']
reg = LinearRegression()
reg = reg.fit(rating_feature, downloads)

# Slope and intercept of the fitted line
print(f"m = {reg.coef_[0]}")
print(f"b = {reg.intercept_}")

m = 201.51801452096663
b = 58.16598706508057


plt.figure(figsize=(11, 5))

# Every game as a translucent point: rating on x, downloads on y
plt.scatter(df['Rating'], df['Downloads'], c="lightcoral", s=30, alpha=0.2)

# Fitted regression line evaluated at 100 evenly spaced ratings from 0 to 5
x = np.linspace(0, 5, 100)
fitted_line = reg.coef_[0]*x+reg.intercept_
plt.plot(x, fitted_line, c="#1f77b4", linewidth=2)

plt.xlabel("Rating")
plt.ylabel("Downloads")
plt.title("Effect of Rating on Downloads")
plt.show()


import statsmodels.api as sm

# Second regression of downloads on rating, this time with statsmodels OLS.
# Note that X is rebound here to the single rating column.
X = df['Rating'].array.reshape(-1, 1)
design = sm.add_constant(X)  # adds the intercept column
est = sm.OLS(df['Downloads'], design)
results = est.fit()
print(results.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:              Downloads   R-squared:                       0.149
Model:                            OLS   Adj. R-squared:                  0.149
Method:                 Least Squares   F-statistic:                     1757.
Date:                Mon, 16 May 2022   Prob (F-statistic):               0.00
Time:                        17:43:53   Log-Likelihood:                -81003.
No. Observations:                9999   AIC:                         1.620e+05
Df Residuals:                    9997   BIC:                         1.620e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         58.1660     10.816      5.378      0.000      36.965      79.367
x1           201.5180      4.807     41.918      0.000     192.094     210.942
==============================================================================
Omnibus:                    11818.495   Durbin-Watson:                   1.999
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1538866.816
Skew:                           6.265   Prob(JB):                         0.00
Kurtosis:                      62.470   Cond. No.                         3.35
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

	Rank	Name	Platform	Year	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
0	1	Wii Sports	Wii	2006.0	Sports	Nintendo	41.49	29.02	3.77	8.46	82.74
1	2	Super Mario Bros.	NES	1985.0	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24
2	3	Mario Kart Wii	Wii	2008.0	Racing	Nintendo	15.85	12.88	3.79	3.31	35.82
3	4	Wii Sports Resort	Wii	2009.0	Sports	Nintendo	15.75	11.01	3.28	2.96	33.00
4	5	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37

	Rank	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales
Year
1980.0	29826	10.59	0.67	0.00	0.12	11.38
1981.0	190488	33.40	1.96	0.00	0.32	35.77
1982.0	149186	26.92	1.65	0.00	0.31	28.86
1983.0	56759	7.76	0.80	8.10	0.14	16.79
1984.0	22911	33.28	2.10	14.27	0.70	50.36

	Title	Released	Platforms	Downloads	Rating	Tags	Genres
9995	Fallout 4	2015-11-09	[pc, xbox-one, playstation4]	10785	3.79	[singleplayer, steam-achievements, atmospheric...	[action, role-playing-games-rpg]
9996	DOOM (2016)	2016-05-13	[pc, xbox-one, playstation4, nintendo-switch]	10906	4.39	[singleplayer, multiplayer, atmospheric, great...	[shooter, action]
9997	Red Dead Redemption 2	2018-10-26	[pc, playstation4, xbox-one]	12006	4.58	[singleplayer, multiplayer, atmospheric, great...	[adventure, action]
9998	Life is Strange	2015-01-29	[pc, xbox-one, playstation4, ios, android, mac...	12544	4.11	[singleplayer, steam-trading-cards, atmospheri...	[adventure]
9999	The Witcher 3: Wild Hunt	2015-05-18	[pc, playstation5, xbox-one, playstation4, xbo...	16021	4.67	[singleplayer, atmospheric, full-controller-su...	[adventure, action, role-playing-games-rpg]

	Title	Released	Platforms	Downloads	Rating	Tags	Genres
8459	Scrap Mechanic	2016-01-19	[pc]	514	Liked	[singleplayer, multiplayer, atmospheric, steam...	[indie, adventure, action, simulation]
655	Isbarah	2015-02-25	[pc, macos, linux]	43	Disliked	[singleplayer, steam-achievements, steam-tradi...	[indie, action]
1572	Cursed Town	2018-05-25	[pc]	52	Disliked	[singleplayer, steam-achievements, rpg, story-...	[adventure, role-playing-games-rpg]
2229	Tori	2018-04-19	[pc, macos]	58	Disliked	[singleplayer]	[casual, indie, adventure]
2013	Final Warrior Quest	2018-04-12	[pc]	56	Disliked	[singleplayer, full-controller-support, rpg, c...	[indie, role-playing-games-rpg]

	Title	Released	Platforms	Downloads	Rating	Tags	Genres
8459	Scrap Mechanic	2016-01-19	[pc]	514	4-5	[singleplayer, multiplayer, atmospheric, steam...	[indie, adventure, action, simulation]
655	Isbarah	2015-02-25	[pc, macos, linux]	43	0-1	[singleplayer, steam-achievements, steam-tradi...	[indie, action]
1572	Cursed Town	2018-05-25	[pc]	52	0-1	[singleplayer, steam-achievements, rpg, story-...	[adventure, role-playing-games-rpg]
2229	Tori	2018-04-19	[pc, macos]	58	0-1	[singleplayer]	[casual, indie, adventure]
2013	Final Warrior Quest	2018-04-12	[pc]	56	0-1	[singleplayer, full-controller-support, rpg, c...	[indie, role-playing-games-rpg]

The Game Development Dream¶

Understanding the Market¶

Data Collection:¶

Data Analysis and Visualization:¶

Predicting a Game's Success in Today's Market¶

Data Collection:¶

Data Preparation:¶

Model Fitting and Testing:¶

Rating vs Downloads¶

Model Fitting and Testing:¶

Conclusion¶

Sources:¶