# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='white');
%matplotlib inline
# load dataset into a pandas dataframe
loan_df = pd.read_csv('data/prosperLoanData.csv')
loan_df.shape
loan_df.head(5)
loan_df.info()
# Filter loan dataframe based on the features of interest.
filtered_loan_df = loan_df[[
'Term', 'LoanStatus','BorrowerAPR','BorrowerRate',
'EstimatedLoss','EstimatedReturn','ProsperRating (Alpha)',
'ProsperScore','ListingCategory (numeric)','BorrowerState','EmploymentStatusDuration','DebtToIncomeRatio',
'StatedMonthlyIncome','TotalProsperLoans','TotalProsperPaymentsBilled','LoanOriginalAmount',
'LoanOriginationDate'
]]
filtered_loan_df.head()
# Get Loan Data that Originated after July 2009
# This is because some of the variables of interest are applicable to data
# that comes after July 2009. i.e. ProsperRating, ProsperScore, EstimatedReturn, EstimatedLoss, etc
loans_after_july_2009 = filtered_loan_df.query('LoanOriginationDate > "2009-07"')
# rename ListingCategory (numeric) to ListingCategory
# rename ProsperRating (Alpha) to ProsperRating
loans_after_july_2009 = loans_after_july_2009.rename(columns={"ListingCategory (numeric)": "ListingCategory",
"ProsperRating (Alpha)":"ProsperRating"})
# Replace the integer values with the name values for the ListingCategory
loans_after_july_2009['ListingCategory'] = loans_after_july_2009['ListingCategory'].replace({
0: 'Not Available',
1: 'Debt Consolidation',
2: 'Home Improvement',
3: 'Business',
4: 'Personal Loan',
5: 'Student Use',
6: 'Auto',
7: 'Other',
8: 'Baby & Adoption',
9: 'Boat',
10: 'Cosmetic Procedure',
11: 'Engagement Ring',
12: 'Green Loans',
13: 'Household Expenses',
14: 'Large Purchases',
15: 'Medical/Dental',
16: 'Motorcycle',
17: 'RV',
18: 'Taxes',
19: 'Vacation',
20: 'Wedding Loans'
})
# Categorical Datatype Conversions
loans_after_july_2009['ListingCategory'] = loans_after_july_2009['ListingCategory'].astype('category')
loans_after_july_2009['LoanStatus'] = loans_after_july_2009['LoanStatus'].astype('category')
# Remove data without ProsperScore(s)
loans_after_july_2009 = loans_after_july_2009[loans_after_july_2009['ProsperScore'].notna()]
# Change ProsperScore to String.
loans_after_july_2009['ProsperScore'] = loans_after_july_2009['ProsperScore'].astype(int).astype(str)
# Change Term to Str and then to Category type
loans_after_july_2009['Term'] = loans_after_july_2009['Term'].astype(str)
# Ordered Categorical Data for some variables
# Convert Term, ProsperScore, and ProsperRating into ordered categorical types
ordinal_var_dict = {
'Term': ['12', '36','60'],
'ProsperScore': ['1','2','3 ','4','5','6','7','8','9','10','11'],
'ProsperRating': ['HR', 'E', 'D', 'C', 'B', 'A', 'AA']
}
for var in ordinal_var_dict:
ordered_var = pd.api.types.CategoricalDtype(ordered = True,
categories = ordinal_var_dict[var])
loans_after_july_2009[var] = loans_after_july_2009[var].astype(ordered_var)
loans_after_july_2009.describe()
default_colour = sns.color_palette()[0]
# Dictionary of Numerical Variables and Some properties to be used in plotting their distributions
numerical_variables = {
'BorrowerAPR': {'binsize': 0.01, 'x_label': 'Borrower\'s Annual Percentage Rate'},
'BorrowerRate': {'binsize': 0.005, 'x_label': 'Borrower\'s Interest Rate'},
'EstimatedLoss': {'binsize': 0.005, 'x_label': 'Estimated Loss'},
'EstimatedReturn': {'binsize': 0.005, 'x_label': 'Estimated Returns'},
'StatedMonthlyIncome': {'binsize': 10000, 'x_label': 'Monthly Income'},
'EmploymentStatusDuration': {'binsize': 10, 'x_label': 'Duration of Employment'},
'DebtToIncomeRatio': {'binsize': 0.1, 'x_label': 'Debt to Income Ratio'},
'TotalProsperLoans': {'binsize': 0.4, 'x_label': 'Total Prosper Loans'},
'TotalProsperPaymentsBilled': {'binsize': 3, 'x_label': 'Total Prosper Payments Billed'},
'LoanOriginalAmount': {'binsize': 1000, 'x_label': 'Loan Original Amount'},
}
# Helper function to plot distributions
def plot_distribution(data, binsize, x_label):
bins = np.arange(data.min(), data.max()+binsize, binsize)
plt.hist(data,bins=bins)
plt.xlabel(x_label)
plt.show();
# Plot distribution of numerical variables.
for var in numerical_variables:
plot_distribution(loans_after_july_2009[var],
numerical_variables[var]['binsize'],
numerical_variables[var]['x_label']
)
# DebtToIncomeRatio
bins = np.arange(loans_after_july_2009['DebtToIncomeRatio'].min(),
loans_after_july_2009['DebtToIncomeRatio'].max()+0.02, 0.02)
plt.hist(loans_after_july_2009['DebtToIncomeRatio'],bins=bins)
plt.xlabel('Debt To Income Ratio');
plt.xlim(0,0.8);
# MonthlyIncome has a Huge Outlier of 1750000
# Zoom into plot to get a clearer distribution
bins = np.arange(loans_after_july_2009['StatedMonthlyIncome'].min(),
loans_after_july_2009['StatedMonthlyIncome'].max()+750, 750)
plt.hist(loans_after_july_2009['StatedMonthlyIncome'], bins=bins)
plt.xlabel('Monthly Income');
plt.xlim(0,25000);
# LoanTerm
default_colour = sns.color_palette()[0]
sns.countplot(data=loans_after_july_2009, x='Term', color=default_colour);
plt.figure(figsize=(10,5))
sns.countplot(data=loans_after_july_2009, x='LoanStatus', color=default_colour);
plt.xticks(rotation=80);
plt.figure(figsize=(10,5))
sns.countplot(data=loans_after_july_2009, x='ProsperRating', color=default_colour);
plt.figure(figsize=(10,5))
sns.countplot(data=loans_after_july_2009, x='ProsperScore', color=default_colour);
plt.figure(figsize=(15,25))
sns.countplot(data=loans_after_july_2009, y='BorrowerState', color=default_colour);
plt.figure(figsize=(15,10))
sns.countplot(data=loans_after_july_2009, x='ListingCategory', color=default_colour);
plt.xticks(rotation=80);
# Correlation Plot for All Numerical Variables
plt.figure(figsize=[15,10])
ax = sns.heatmap(loans_after_july_2009[numerical_variables].corr(), annot = True, fmt = '.3f',
cmap = 'vlag_r', center = 0)
ax.set_ylim(len(numerical_variables),0)
plt.show()
# plot matrix: sample 500 loan_data so that plots are clearer and they render faster
num_var = ['BorrowerRate','LoanOriginalAmount','DebtToIncomeRatio','StatedMonthlyIncome',
'EstimatedReturn','EstimatedLoss','BorrowerAPR']
samples = np.random.choice(loans_after_july_2009.shape[0], 500, replace = False)
loans_sample = loans_after_july_2009.loc[samples,:]
g = sns.PairGrid(data = loans_sample, vars = num_var);
g = g.map_diag(plt.hist, edgecolor='w', bins=20);
g.map_offdiag(plt.scatter, edgecolor='w', s=30);
# BorrowerRate + BorrowerAPR
plt.figure(figsize=(10,5))
sns.scatterplot(data=loans_after_july_2009, x='BorrowerRate', y="BorrowerAPR", alpha=1/10);
# It's understable that the BorrowerRate and BorrowerAPR have a high positive correlation
# The BorrowerRate is also a component of the Borrower's Annual Percentage Rate (APR)
# BorrowerRate + Loan Original Amount
plt.figure(figsize=(10,5))
sns.scatterplot(data=loans_after_july_2009, x='BorrowerRate', y="LoanOriginalAmount", alpha=1/30);
# Higher Loan Amount didn't necessarily have a high Interest Rate.
# It appears Higher Loan Amounts have a smaller BorrowerRate (Interest Rate)
# BorrowerRate + Estimated Return
plt.figure(figsize=(10,5))
sns.scatterplot(data=loans_after_july_2009, x='BorrowerRate', y="EstimatedReturn", alpha=1/10);
# BorrowerRate + EstimatedLoss
plt.figure(figsize=(10,5))
sns.scatterplot(data=loans_after_july_2009, x='BorrowerRate', y="EstimatedLoss",alpha=1/10);
# BorrowerRate + DebtToIncomeRatio
plt.figure(figsize=(10,5))
sns.scatterplot(data=loans_after_july_2009, x='BorrowerRate', y="DebtToIncomeRatio", alpha=1/10);
plt.ylim(0,0.8);
# There was no clear correlation between the BorrowerRate and the DebtToIncomeRatio.
# However what we observe is that most of the higher DebtToIncomeRatio values have a BorrowerRate that
# ranges from approximately 0.31 to 0.33.
# BorrowerRate + LoanStatus
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009,y='BorrowerRate',x='LoanStatus',color=default_colour,inner='quartile');
plt.xticks(rotation=80);
# Most of the loans that were chargeof had high interest rates folowed by Defaulted Loans
# The Completed Loans have a fair distribution across the quartiles
# Most of the Current Loans have a lesser Interest Rate.
# The remaining have a fairly equal distribution of Interest Rates.
# BorrowerRate + ListingCategory
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, y='BorrowerRate', x='ListingCategory', color=default_colour,
inner='quartile');
plt.xticks(rotation=90);
# Considering the Categories that recorded the highest values in our Univariate Analysis of the Listing Category
# most of the DebtConsolidation Category had a smaller BorrowerRate, below 0.2 which happens to be the
# approximate Median we observe. Business, Home Improvement and Auto Loans had a fairly uniform distribution
# though the Upper Quartiles have a higher BorrowerRate.
# BorrowerRate + ProsperRating
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, y='BorrowerRate', x='ProsperRating', color=default_colour,
inner="quartile");
# We observe that the BorrowerRate (Interest Rates) decrease as we move from HR to AA across the
# Prosper Ratings
# BorrowerRate + ProsperScore
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, y='BorrowerRate', x='ProsperScore', color=default_colour,
inner="quartile");
# Loans with high risk score have higher BorrowerRates as we see with ProsperScore of 1 and 2. Lower Risk Scores
# of 9,10,11 record lower interest rates. The BorrowerRates generally decline from ProsperScore of 1 to 11
# BorrowerRate + Term
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, y='BorrowerRate', x='Term', color=default_colour,
inner="quartile");
# Loans taken for 12months have a lesser Interest Rate
# Loans taken for 36 and 60 months have almost the same Median Interest rates but
# the 36 month Loan term generally have higher Interest Rate
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, x='LoanStatus', y='LoanOriginalAmount',
inner="quartile", color=default_colour);
plt.xticks(rotation=45);
# Most of the loans in the ChargedOff, Completed and Defaulted Status had Smaller LoanOriginalAmount
# We have to find out why these small loans couldn't be paid off but had to be Defaulted or ChargedOff.
plt.figure(figsize=(15,10))
sns.boxplot(data=loans_after_july_2009, x='LoanStatus', y='StatedMonthlyIncome',
color=default_colour);
plt.xticks(rotation=45);
plt.ylim(0,60000);
# The Median Monthly Income was almost the same across the various LoanStatus.
# ChargedOff, Completed and Current LoanStatus recorded most of the higher Monthly Income.
# ChargedOff, Defaulted and Past Due (>120 days) Loans had lesser Median Monthly Income.
plt.figure(figsize=(15,5))
ax = sns.countplot(data=loans_after_july_2009, x='LoanStatus', hue='ListingCategory', color=default_colour);
ax.legend(loc = 1, ncol = 2);
plt.xticks(rotation=75);
# ListingCategories of top Loan Status
plt.figure(figsize=(15,5))
ax = sns.countplot(x='LoanStatus', hue='ListingCategory',data=loans_after_july_2009, color=default_colour,
order=loans_after_july_2009['LoanStatus'].value_counts().iloc[:3].index);
ax.legend(loc = 1, ncol = 2);
plt.figure(figsize=(15,5))
sns.countplot(data=loans_after_july_2009, x='LoanStatus', hue='Term', color=default_colour);
plt.xticks(rotation=75);
# The Chargedoff, Completed and Current Loans all recorded a 36month Term for most of the Loans followed by
# The 60month and 12month Term Respectively.
# Terms of top Loan Status
sns.countplot(x='LoanStatus', hue='Term',data=loans_after_july_2009, color=default_colour,
order=loans_after_july_2009['LoanStatus'].value_counts().iloc[:3].index);
plt.figure(figsize=(15,5))
sns.countplot(data=loans_after_july_2009, x='LoanStatus', hue='ProsperRating', color=default_colour);
plt.xticks(rotation=75);
# Most of the Current Loans were rated, **C** and **D** for Completed Loans. The highest for Chargedoff Loans
# was **D** followed by **HR** but with a small difference.
# ProsperRating of top LoanStatus
plt.figure(figsize=(15,5))
ax = sns.countplot(x='LoanStatus', hue='ProsperRating',data=loans_after_july_2009, color=default_colour,
order=loans_after_july_2009['LoanStatus'].value_counts().iloc[:3].index);
plt.figure(figsize=(15,7))
sns.countplot(data=loans_after_july_2009, x='LoanStatus', hue='ProsperScore', color=default_colour);
plt.xticks(rotation=75);
# Chargedoff Loans appeared to have a normal distribution across the ProsperScore.
# Most Completed Loans were rated 9 while most Current Loans were rated 4.
# ProsperScore of top LoanStatus
plt.figure(figsize=(15,5))
ax = sns.countplot(x='LoanStatus', hue='ProsperScore',data=loans_after_july_2009, color=default_colour,
order=loans_after_july_2009['LoanStatus'].value_counts().iloc[:3].index);
plt.figure(figsize=(15,10))
sns.violinplot(data=loans_after_july_2009, x='LoanStatus', y='DebtToIncomeRatio', color=default_colour,
inner="quartile");
plt.ylim(-0.6,1.0);
plt.xticks(rotation=90);
# We observe a median DebtToIncomeRatio across the various Loan Status.
plt.figure(figsize=(20,10))
sns.violinplot(data=loans_after_july_2009, x='LoanStatus', y='EstimatedReturn', color=default_colour,
inner="quartile");
plt.xticks(rotation=90);
# The Chargedoff and Defaulted Loans has a higher Estimated Return. The Current Loans have a smaller Estimated
# Return. The Completed and FinalPaymentInProgress Loans have a fairly uniform Distibution of EstimatedReturn.
# The remaining categories have an equal distribution of the EstimatedReturn.
plt.figure(figsize=(20,8))
sns.violinplot(data=loans_after_july_2009, x='LoanStatus', y='EstimatedLoss', color=default_colour,
inner="quartile");
plt.xticks(rotation=80);
# The Current Loans had a greater number belonging to the lower quartiles of EstimatedLoss. Chargedoff and
# Defaulted Loans have a greater number belonging to the upper quartiles. The remaining share a fairly
# Similar distribution.
# LoanOriginalAmount, LoanStatus, BorrowerRates
# plt.figure(figsize=[20,10])
g = sns.FacetGrid(data = loans_after_july_2009, col = 'LoanStatus', height = 6, col_wrap=3)
g.map(plt.scatter, 'BorrowerRate', 'LoanOriginalAmount', alpha=0.35);
# For **Completed and Current Loans** we observe that the high LoanOriginalAmounts have a smaller
# BorrowerRates. The remaining Listing Status appear to have higher BorrowerRates. As observed early on,
# Most of the LoanOriginalAmounts are below 15,000 USD.
# ListingCategory, LoanStatus, BorrowerRates
g = sns.FacetGrid(data = loans_after_july_2009, col = 'ListingCategory', height = 6, col_wrap=3)
g.map(sns.boxplot, 'BorrowerRate', 'LoanStatus');
# The **Debt Consolidation, Other, Home Improvement, Business and Auto** Listings have a highest BorrowerRate
# Median for the Chargedoff and Defaulted Loans.
# ProsperRating, LoanStatus, BorrowerRates
plt.figure(figsize=[20,10])
cat_means = loans_after_july_2009.groupby(['ProsperRating', 'LoanStatus']).mean()['BorrowerRate']
cat_means = cat_means.reset_index(name = 'borrower_rate_avg')
cat_means = cat_means.pivot(index = 'LoanStatus', columns = 'ProsperRating',
values = 'borrower_rate_avg')
ax = sns.heatmap(cat_means, annot = True, fmt = '.3f',cmap = 'vlag_r', center= 0,
cbar_kws = {'label' : 'Mean BorrowerRate'})
ax.set_ylim(11,0);
# ProsperScore, LoanStatus, BorrowerRates
plt.figure(figsize=[20,10])
cat_means = loans_after_july_2009.groupby(['ProsperScore', 'LoanStatus']).mean()['BorrowerRate']
cat_means = cat_means.reset_index(name = 'borrower_rate_avg')
cat_means = cat_means.pivot(index = 'LoanStatus', columns = 'ProsperScore',
values = 'borrower_rate_avg')
ax = sns.heatmap(cat_means, annot = True, fmt = '.3f',cmap = 'vlag_r', center= 0,
cbar_kws = {'label' : 'Mean BorrowerRate'})
ax.set_ylim(11,0);