India's lower house of Parliament, the Lok Sabha (House of the People), has 543 seats in total. Its members are elected by the adult citizens of India, who choose from the candidates standing in their respective constituencies. Every adult citizen of India can vote only in their own constituency. Candidates who win the Lok Sabha elections are called 'Members of Parliament' and hold their seats for five years or until the body is dissolved by the President on the advice of the Council of Ministers.
There are more than 700 million voters and more than 800,000 polling stations.
The Lok Sabha election is a very complex affair as it involves a lot of factors. It is this very fact that makes it a perfect topic to analyze.
Currently there are two major parties in India: the Bharatiya Janata Party (BJP) and the Indian National Congress (INC).
As India is a country of great diversity, and each region is very different from every other region, there are a lot of regional or state parties with major influence. These parties can either support one of the alliances to form a government or stay independent.
There are two major alliances, the NDA led by BJP and the UPA led by INC.
The candidate dataset has 15 features namely 'ST_CODE', 'State_name', 'Month', 'Year', 'PC_Number', 'PC_name', 'PC_Type', 'Candidate_Name', 'Candidate_Sex', 'Candidate_Category', 'Candidate_Age', 'Party_Abbreviation', 'Total_Votes_Polled', 'Position','Alliance'.
The elector dataset consists of 8 features, namely 'STATE CODE', 'STATE', 'PC NO', 'PARLIAMENTARY CONSTITUENCY', 'Total voters', 'Total_Electors', 'TOT_CONTESTANT', 'POLL PERCENTAGE'.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Load the elector-level CSV for the 2009 election
electors_2009 = pd.read_csv("../data/LS2009Electors.csv")
# Count the missing values in every elector column
electors_2009.isna().sum()
# Load the candidate-level CSV for the same election
candidate_2009 = pd.read_csv('../data/candidate09.csv')
# (rows, columns) of the candidate table
candidate_2009.shape
# Count the candidates of each gender and draw the counts as a bar chart
gender_counts = candidate_2009['Candidate_Sex'].value_counts()
gender_counts.plot.bar(rot=0, title='Gender comparison 2009 elections')
# NOTE(review): the tick labels are assigned by position, which assumes male
# candidates are the larger group (value_counts orders by frequency) — confirm
plt.xticks(ticks=(0, 1), labels=('Male Candidates', 'Female Candidates'))
plt.xlabel('Gender')
plt.ylabel('No. of candidates')
plt.show()
# Put the age summary statistics of the winners (Position == 1) next to
# those of every candidate
is_winner = candidate_2009['Position'] == 1
winner_age_stats = candidate_2009.loc[is_winner, 'Candidate_Age'].describe()
overall_age_stats = candidate_2009['Candidate_Age'].describe()
pd.DataFrame({'winner': winner_age_stats, 'all': overall_age_stats})
# Ages of the winning candidates (Position == 1), drawn as a 25-bin histogram
winner = candidate_2009.loc[candidate_2009['Position'] == 1, 'Candidate_Age']
winner_hist_ax = winner.plot.hist(bins=25, title='Winner Candidates')
winner_hist_ax.set_xlabel('Age of the Candidates')
winner_hist_ax.set_ylabel('Number of Candidates')
plt.show()
# Two panels in one row: ages of all candidates (left) and of winners (right)
fig, ax = plt.subplots(nrows=1, ncols=2, tight_layout=True)
# (axes, ages, panel title, bar colour); None keeps matplotlib's default colour
panels = (
    (ax[0], candidate_2009.Candidate_Age, 'All the Candidates', None),
    (ax[1], winner, 'Winner Candidates', 'red'),
)
for panel, ages, heading, colour in panels:
    panel.hist(list(ages), bins=25, color=colour)
    panel.set_xlabel('Age of the Candidates')
    panel.set_ylabel('Number of Candidates')
    panel.set_title(heading)
plt.show()
# Overlay both age histograms on a single axes to compare their proportions
bins = 25
plt.figure(figsize=(8, 8))
all_ages = candidate_2009['Candidate_Age']
winning_ages = candidate_2009.loc[candidate_2009['Position'] == 1, 'Candidate_Age']
# Same bin count for both series; the labels feed the legend
all_ages.plot.hist(bins=bins, label='all')
winning_ages.plot.hist(bins=bins, label='winners')
plt.title('Age comparison winners vs all')
plt.xlabel('Age')
plt.ylabel('No of candidates')
plt.legend(loc='upper right')
plt.show()
# Lets see the features required for this plot
candidate_2009[['Party_Abbreviation', 'Total_Votes_Polled']]
# Total the votes polled by every party and keep the ten largest totals
party_votes = candidate_2009.groupby('Party_Abbreviation')['Total_Votes_Polled'].sum()
top_party_votes = party_votes.sort_values(ascending=False).head(10)
# The y-axis label promises millions, so scale the totals to match it
# (assumes Total_Votes_Polled holds raw vote counts — TODO confirm)
(top_party_votes / 1e6).plot.bar(rot=0, title='Vote shares')
plt.ylabel('No of votes in millions')
plt.xlabel('Party')
plt.show()
## Adapted from https://stackoverflow.com/a/56780852/8210613 to show values on the bars
def show_values_on_bars(axs, h_v="v", xspace=0.4, yspace=0.4, unit='%'):
    """Write each bar's value next to it, on one axes or on an array of axes.

    Args:
        axs: an axes-like object exposing ``patches`` and ``text``, or a
            NumPy array of such objects (every element is annotated).
        h_v: ``"v"`` for vertical bars (the bar height is labelled) or
            ``"h"`` for horizontal bars (the bar width is labelled). Any
            other value leaves the axes untouched.
        xspace: horizontal offset added to the label position; only used
            when ``h_v == "h"``.
        yspace: vertical offset added to the label position; only used
            when ``h_v == "h"``.
        unit: suffix appended to each value after rounding to 2 decimals.
    """

    def _show_on_single_plot(ax):
        if h_v == "v":
            for p in ax.patches:
                # Centre the label horizontally on the top edge of the bar
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height()
                value = str(round(float(p.get_height()), 2)) + unit
                ax.text(_x, _y, value, ha="center")
        elif h_v == "h":
            for p in ax.patches:
                # Start from the far corner of the bar, then apply the offsets
                _x = p.get_x() + p.get_width() + float(xspace)
                _y = p.get_y() + p.get_height() + float(yspace)
                value = str(round(float(p.get_width()), 2)) + unit
                ax.text(_x, _y, value, ha="left")

    if isinstance(axs, np.ndarray):
        # An array of axes (any shape): annotate every element
        for ax in axs.flat:
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)
import seaborn as sns
# Mean POLL PERCENTAGE of every STATE, highest first
polls = electors_2009.groupby('STATE')['POLL PERCENTAGE'].mean().sort_values(ascending=False)
# Tall figure so that every state gets its own readable row
plt.figure(figsize=(6,20))
# Pass the vectors by keyword: seaborn >= 0.12 no longer accepts x and y
# as positional arguments
sns_t = sns.barplot(x=polls, y=polls.index)
# Horizontal bars: write each mean as a percentage, using the given x/y offsets
show_values_on_bars(sns_t, "h", -10.2,-0.3 ,'%')
# Seats won per party in Uttar Pradesh: first-placed candidates of that state
up_winner_mask = (candidate_2009['Position'] == 1) & (candidate_2009['State_name'] == 'Uttar Pradesh')
up_seats = candidate_2009.loc[up_winner_mask, 'Party_Abbreviation'].value_counts()
ax = up_seats.plot.barh(title='Winning Seats distributions by party in UP')
# Flip the y-axis to show the parties in descending order
ax.invert_yaxis()
ax.set_xlabel('No. of seats')
ax.set_ylabel('Party')
# [optional] write the figure to disk so it can be included in external reports
plt.savefig('up.png')
Alliances in Gujarat, Madhya Pradesh and Maharashtra.
# Subset the dataset for the states of Gujarat, Maharashtra and Madhya Pradesh
states_list = ['Gujarat', 'Madhya Pradesh', 'Maharashtra']
# Combine both conditions into one mask. Chaining two boolean selections
# applies a full-length mask to an already-filtered frame, which pandas has
# to re-align (and warns about).
in_selected_states = candidate_2009.State_name.isin(states_list)
is_seat_winner = candidate_2009.Position == 1
states = candidate_2009[in_selected_states & is_seat_winner]
# Stacked bar plot: one bar per state, one segment per alliance
seats_by_alliance = states.groupby(['State_name', 'Alliance']).size().unstack()
seats_by_alliance.plot.bar(stacked=True, figsize=(10,10), rot=0,
                           title='Alliance-wise seat distributions in three states')
plt.xlabel('States')
plt.ylabel('No. of seats')
plt.show()
# Subset the data with the winner of each constituency of the mentioned states
states_list = ['Andhra Pradesh', 'Kerala', 'Tamil Nadu', 'Karnataka']
# Combine both conditions into one mask. Chaining two boolean selections
# applies a full-length mask to an already-filtered frame, which pandas has
# to re-align (and warns about).
in_selected_states = candidate_2009.State_name.isin(states_list)
is_seat_winner = candidate_2009.Position == 1
states = candidate_2009[in_selected_states & is_seat_winner]
# Grouped bar plot: one group per alliance, one bar per candidate category
seats_by_category = states.groupby(['Alliance', 'Candidate_Category']).size().unstack()
seats_by_category.plot.bar(figsize=(8,8), rot=0, title ="2009 Winning Category")
plt.xlabel("Party-wise Candidate Category", fontsize=12)
plt.ylabel("No. of seats")
plt.show()
# Leave out the GEN winners so that the SC and ST categories can be compared
non_gen_winners = states[states['Candidate_Category'] != 'GEN']
non_gen_counts = non_gen_winners.groupby(['Alliance', 'Candidate_Category']).size().unstack()
# Grouped bar chart: one group per alliance, one bar per remaining category
non_gen_counts.plot.bar(
    figsize=(8, 8),
    rot=0,
    title="2009 Winning Category excl. GEN",
    color=['tab:orange', 'tab:green'],
)
plt.xlabel("Party-wise Candidate Category", fontsize=12)
plt.ylabel("No. of seats")
plt.show()
# Constituencies whose Total_Electors is below 100000, smallest first,
# compared in a horizontal bar chart
below_one_lakh = electors_2009['Total_Electors'] < 100000
small_constituencies = electors_2009[below_one_lakh].sort_values(by='Total_Electors', ascending=True)
small_constituencies.plot.barh(
    x='PARLIAMENTARY CONSTITUENCY',
    y='Total_Electors',
    figsize=(10, 10),
    title="Parliamentary constituencies with less than 1 lakh total voters",
)
plt.xlabel('Number of Voters')
plt.ylabel('Parliamentary Constituencies')
plt.show()
# Seats per party: count the first-placed candidates by party affiliation
is_first_placed = candidate_2009['Position'] == 1
all_winners = candidate_2009.loc[is_first_placed, 'Party_Abbreviation'].value_counts()
# Keep the nine largest parties (value_counts is ordered by frequency) ...
top_10_winners = all_winners.iloc[:9]
# ... and fold every remaining party into one 'Others' slice
remaining_seats = all_winners.sum() - top_10_winners.sum()
top_10_winners['Others'] = remaining_seats
# Pie chart with whole-number percentages on the wedges
top_10_winners.plot.pie(
    autopct='%.f%%',
    figsize=(10, 10),
    title='Top 10 parties with majority seats',
)
plt.legend(loc='upper right')
# Clear the y-axis label
plt.ylabel('')
plt.show()
# Rows per state in the elector table, most frequent first; keep the top nine
seats_per_state = electors_2009['STATE'].value_counts()
top_10_seats = seats_per_state.iloc[:9]
# Fold every remaining state into one 'Others' entry
top_10_seats['Others'] = seats_per_state.sum() - top_10_seats.sum()
# Function to convert percentages into actual values
def autopct_format(values):
    """Build a formatter that labels a percentage with its absolute value.

    Args:
        values: iterable of the numbers whose sum is the 100% total.

    Returns:
        A one-argument function taking a percentage ``pct`` and returning
        ``'<value> (<pct>%)'``, where ``<value>`` is ``pct`` percent of the
        total rounded to the nearest integer and ``<pct>`` is shown with no
        decimals.
    """
    # The total is the same for every label, so compute it once here
    # rather than on each formatter call
    total = sum(values)

    def my_format(pct):
        val = int(round(pct * total / 100.0))
        return '{val:d} ({pct:.0f}%)'.format(val=val, pct=pct)

    return my_format
# Pie chart of the per-state counts; each wedge shows the absolute count
# next to its percentage
wedge_label = autopct_format(top_10_seats.values)
top_10_seats.plot.pie(
    autopct=wedge_label,
    figsize=(10, 10),
    title="Top 10 States with most number of seats",
)
# Clear the y-axis label
plt.ylabel('')
plt.show()