This dataset contains a list of video games with sales greater than 100,000 copies.
In this project I deal only with exploratory analysis: the objective is to understand how the data is distributed and to generate insights for future decision-making. The analysis aims to explore the data as fully as possible in a simple, intuitive and informative way. The data used in this project covers only the years 1980 to 2016.
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
See the pandas options documentation for many other display options.
# Load the sales table, then preview the first rows and the column summary
csv_path = "../data/vgsales.csv"
data = pd.read_csv(csv_path)
data.head()
data.info()
# Horizontal count plot of games per genre, most frequent genre first
genre_order = data['Genre'].value_counts().index
ax = sns.countplot(data=data, y='Genre', order=genre_order)
ax.set_title('Top Game Genre')
Action is the most popular genre followed by Sports
# One bar per release year, drawn on a 10x10 figure
plt.figure(figsize=(10, 10))
year_ax = sns.countplot(data=data, y='Year')
year_ax.set_title('Game releases by year')
# The five years with the most game releases (year -> number of games)
year_counts = data['Year'].value_counts()
max_gp = year_counts.nlargest(5)
max_gp
## Genre counts for the five busiest years, using pandas and matplotlib only
top_year_rows = data.loc[data['Year'].isin(max_gp.index), ['Year', 'Genre']]
# After unstack: one row per year, one column per genre, values = number of games
genre_counts = top_year_rows.groupby(['Year', 'Genre'])['Genre'].count().unstack()
genre_counts.plot.bar(
    figsize=(30, 10),
    title='Distribution of genres for five max production years',
)
# seaborn draws this kind of grouped count plot straight from the raw rows
plt.figure(figsize=(30, 10))
# Five years with the most releases, put back into chronological order for the x axis
top_years = data['Year'].value_counts().nlargest(5).index.sort_values()
genre_by_year_ax = sns.countplot(data=data, x='Year', hue='Genre', order=top_years)
genre_by_year_ax.set_title('Distribution of genres for five max production years')
Nothing interesting in particular, except that Action is the dominant genre
## Total global sales for each (Year, Genre) pair
sales_by_year = (
    data
    .groupby(by=['Year', 'Genre'])
    .Global_Sales
    .sum()
    .reset_index()
)

## Attach each year's highest genre total to every row of that year.
## The aggregation is named by string ('max'): passing the builtin ``max``
## to a groupby transform is deprecated in recent pandas releases.
sales_by_year['Max_Global_Sales'] = (
    sales_by_year
    .groupby(['Year'])
    .Global_Sales
    .transform('max')
)

## Keep only the best-selling genre row(s) of each year; ties are all kept
max_sales_by_year = (
    sales_by_year
    .loc[sales_by_year.Global_Sales == sales_by_year.Max_Global_Sales]
    .drop(columns=['Max_Global_Sales'])
    .reset_index(drop=True)
)
max_sales_by_year.head()
plt.figure(figsize=(30, 10))
ax = sns.barplot(
    data=max_sales_by_year,
    x='Year',
    y='Global_Sales',
    hue='Genre',
    palette="Set2",
    dodge=False,
)
ax.set_title('Highest selling genres by year, globally')

## Annotate every bar with "<genre>---<sales rounded to 2 decimals>",
## written vertically and placed 1 unit above the bar height
bar_labels = zip(max_sales_by_year['Genre'], max_sales_by_year['Global_Sales'])
for position, (genre, sales) in enumerate(bar_labels):
    ax.text(position, sales + 1, genre + '---' + str(round(sales, 2)), rotation=90)
Action has dominated the market in the last 15 years or so.
# Total global sales per platform, largest first
platform_totals = data.groupby('Platform')['Global_Sales'].sum()
df_plat_sales = platform_totals.sort_values(ascending=False).reset_index()

plt.figure(figsize=(15, 10))
platform_ax = sns.barplot(data=df_plat_sales, x='Global_Sales', y='Platform')
platform_ax.set(title='Global Sales by Platform')
## The rows appear to be already sorted by Global_Sales: look at the first 20
data.loc[:19, ['Name', 'Global_Sales']]
data['Global_Sales'].head(20).plot()
# Some names occur more than once - possibly re-releases on other platforms or in other regions?
data['Name'].value_counts()
# Add up global sales over all rows sharing a name, then keep the 20 largest totals
name_totals = data.groupby('Name')['Global_Sales'].sum()
df_name_sales = name_totals.sort_values(ascending=False).reset_index().head(20)

plt.figure(figsize=(15, 10))
top_games_ax = sns.barplot(data=df_name_sales, x='Global_Sales', y='Name')
top_games_ax.set(title='Top 20 selling games globally')
Wii Sports is by far the top-selling game in the world, followed by GTA V and Super Mario Bros.
# Sum each regional sales column over all rows and show the shares as a pie chart
region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
region_totals = (
    data.loc[:, region_columns]
    .sum()
    .reset_index()
    .rename(columns={'index': 'region', 0: 'sales'})
    .set_index('region')
)
region_totals.plot.pie(
    y='sales',
    startangle=270,
    figsize=(10, 8),
    autopct='%.1f%%',
    title="Sales by region",
)
8.1. Distribution of genres for top five producing years
plt.figure(figsize=(25, 10))
# Restrict to rows from the five years with the most entries, then count per genre and year
busiest_years = data['Year'].value_counts().head(5).index
busiest_rows = data.loc[data['Year'].isin(busiest_years), :]
genre_year_ax = sns.countplot(data=busiest_rows, x='Genre', hue='Year')
genre_year_ax.set_title('Distribution of genres for top five producing years')
8.2 Animate pie-chart in Task 7
# Regional sales summed per year (rows with missing values dropped, Year cast to int)
yearly_columns = ['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
complete_rows = data.loc[:, yearly_columns].dropna()
df1 = complete_rows.astype({'Year': int}).groupby('Year').sum()
df1.head()
from matplotlib import animation, rc
from IPython.display import HTML, Image
def update(i):
    """Draw animation frame ``i``: cumulative regional sales up to year ``df1.index[i]``.

    Parameters
    ----------
    i : int
        Zero-based frame number passed in by ``FuncAnimation``; used as a
        positional offset into the year-indexed ``df1``.

    Returns
    -------
    The Axes object returned by ``Series.plot.pie``.
    """
    first_year = df1.index.min()
    last_year = df1.index[i]

    # Rows 0..i inclusive, so the plotted totals match the year range shown in
    # the title. ``df1.head(i)`` stopped one year short: frame 0 had no data
    # and the last year was never included.
    totals = df1.iloc[:i + 1].sum()

    ax.clear()
    ax.axis('equal')
    # Draw on the animation's own axes instead of whichever axes is current.
    return totals.plot.pie(
        ax=ax,
        startangle=270,
        autopct='%.1f%%',
        title="Sales by region {}-{}".format(first_year, last_year),
        label='',
    )
# Same effect as rcParams['animation.html'] = 'html5'
rc('animation', html='html5')

fig, ax = plt.subplots()
# One frame per row (year) of df1; play through once without looping
n_frames = len(df1)
animator = animation.FuncAnimation(fig, update, frames=n_frames, repeat=False)
animator