Loading all the modules that will be used:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# Load the rental listings data set and preview the first rows.
data_path = '../Data/houses_to_rent_v2.csv'
df = pd.read_csv(data_path)
df.head()
Cleaning up the column names: dropping the '(R$)' currency suffix and replacing spaces with underscores.
# Work on a copy so the raw frame stays untouched; normalize column names by
# stripping the literal '(R$)' suffix and turning spaces into underscores.
df_clean = df.copy()
df_clean.columns = (df_clean.columns
                    .str.replace('(R$)', '', regex=False)
                    .str.strip()
                    .str.replace(' ', '_'))
df_clean.head()
Checking data types and missing values.
# Column dtypes and non-null counts for a first sanity check.
df_clean.info()
# Are there missing values?
df_clean.isna().sum()
Creating a function that will allow us to understand the distribution of the data.
def distbox(dist):
    """Plot the distribution of a numeric Series.

    Draws a histogram with a KDE overlay on top and a boxplot below,
    sharing the x axis.

    Parameters
    ----------
    dist : pd.Series
        Numeric column to visualize; its ``name`` attribute (underscores
        replaced by spaces, title-cased) is used as the x-axis label.
    """
    fig, axes = plt.subplots(2, 1, sharex=True, figsize=(12, 6))
    # Histogram + KDE. sns.distplot was deprecated in seaborn 0.11 and
    # removed in 0.14; histplot(kde=True) is the modern equivalent.
    sns.histplot(dist, kde=True, ax=axes[0])
    axes[0].set_title('Distribution', fontsize=15)
    axes[0].set_xlabel('')
    # Boxplot of the same data to make outliers visible.
    sns.boxplot(x=dist, ax=axes[1])
    axes[1].set_xlabel(dist.name.replace('_', ' ').title(), fontsize=13)
    plt.tight_layout()
Is there a reason why the floor is treated as object type?
# Frequency of each raw 'floor' value; the column is object-typed and
# contains a '-' placeholder (see the fix below).
df_clean['floor'].value_counts()
Let's assume that, if it is not an apartment, then the floor is equal to 0 (zero) instead of - (dash).
# Treat the '-' placeholder (properties with no floor) as floor 0.
df_clean['floor'] = df_clean['floor'].replace('-', 0)
df_clean['floor'].value_counts()
Converting floor to an integer type will allow us to make correlations easier.
# With the '-' placeholders replaced, the column can become numeric so it
# participates in correlations.
df_clean['floor'] = df_clean['floor'].astype('int64')
df_clean.info()
An overview of our data.
# Summary statistics for all numeric columns (transposed for readability).
df_clean.describe().T
def outliers(val):
    """Return the Tukey fences for a numeric sequence.

    Values outside ``(Q1 - 1.5*IQR, Q3 + 1.5*IQR)`` are considered outliers.

    Parameters
    ----------
    val : array-like
        Numeric values.

    Returns
    -------
    tuple of float
        (lower_fence, upper_fence).
    """
    # The 'interpolation=' keyword of np.percentile was deprecated in
    # NumPy 1.22; 'method=' selects the same 'midpoint' behaviour.
    q1 = np.percentile(val, 25, method='midpoint')   # first quartile
    q3 = np.percentile(val, 75, method='midpoint')   # third quartile
    iqr = q3 - q1                                    # interquartile range
    return (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
Understanding the distribution of the rent amount, our target variable.
distbox(df_clean['rent_amount'])
# One rent-amount distribution per city, stacked on a shared x axis.
# sns.distplot was removed in seaborn 0.14; histplot(kde=True) replaces it.
fig, axes = plt.subplots(df_clean['city'].nunique(), 1, sharex=True, figsize=(12, 8))
colors = ['b', 'r', 'y', 'g', 'm']
for i, city in enumerate(df_clean['city'].unique()):
    sns.histplot(df_clean[df_clean['city'] == city]['rent_amount'],
                 color=colors[i], kde=True, ax=axes[i])
    axes[i].set_title(city + ' - Rent Amount', fontsize=13)
    axes[i].set_xlabel('')
plt.tight_layout()
# Per-city boxplots of rent amount. seaborn >= 0.12 requires data to be
# passed by keyword, so use x= instead of a positional argument.
fig, axes = plt.subplots(df_clean['city'].nunique(), 1, sharex=True, figsize=(12, 8))
colors = ['b', 'r', 'y', 'g', 'm']
for i, city in enumerate(df_clean['city'].unique()):
    sns.boxplot(x=df_clean[df_clean['city'] == city]['rent_amount'],
                color=colors[i], ax=axes[i])
    axes[i].set_title(city + ' - Rent Amount', fontsize=13)
    axes[i].set_xlabel('')
plt.tight_layout()
Analysing correlation between variables and checking the features that most influence our target.
# Correlation matrix of the numeric columns. numeric_only=True is required
# from pandas 2.0 onwards (DataFrame.corr no longer silently drops object
# columns); computing it once also avoids three identical passes.
corr = df_clean.corr(numeric_only=True)
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tight_layout()
# We will assume a correlation equal to or greater than 0.5 as influential
corr[corr['rent_amount'] >= 0.5]['rent_amount'].round(3)
Removing correlation from itself, there are four features with influential correlations with the rent amount: rooms, bathroom, parking spaces and fire insurance. Plotting those features to visualize their distribution.
# Distributions of the four features most correlated with rent_amount.
distbox(df_clean['rooms'])
distbox(df_clean['bathroom'])
distbox(df_clean['parking_spaces'])
distbox(df_clean['fire_insurance'])
Fire insurance has the highest correlation, almost perfect.
# Scatter plot with a fitted regression line for the strongest predictor.
plt.figure(figsize=(10,6))
sns.regplot(x='fire_insurance', y='rent_amount', data=df_clean,
line_kws={'color':'r'})
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
Plotting the least influential features.
df_clean.head()
# Area distribution (raw, including outliers).
distbox(df_clean['area'])
# Defining outliers
outliers(df_clean['area'])
# Removing outliers from the chart
# NOTE(review): 371.0 is the upper Tukey fence returned by outliers() above —
# recompute it if the data changes.
distbox(df_clean[df_clean['area'] < 371.0]['area'])
distbox(df_clean['floor'])
df_clean['floor'].max()
# 301 looks like an apartment number (third floor) that was entered as the
# floor itself, so map it back to 3.
df_clean.loc[df_clean['floor'] == 301, 'floor'] = 3
distbox(df_clean['floor'])
# hoa distribution — presumably the monthly condo fee in R$; confirm with
# the data-set description.
distbox(df_clean['hoa'])
# Top ten hoa values to inspect the extreme entries.
df_clean['hoa'].sort_values(ascending=False)[:10]
# Defining outliers
outliers(df_clean['hoa'])
# Removing outliers from the chart
# NOTE(review): 2840.0 is the hoa upper fence from outliers() above.
distbox(df_clean[df_clean['hoa'] < 2840.0]['hoa'])
# Full rows for the six largest hoa values.
df_clean.loc[df_clean['hoa'].sort_values(ascending=False)[:6].index]
distbox(df_clean['property_tax'])
# Defining outliers
outliers(df_clean['property_tax'])
# Removing outliers from the chart
# NOTE(review): 880.5 is the property_tax upper fence from outliers() above.
distbox(df_clean[df_clean['property_tax'] < 880.5]['property_tax'])
# One pie chart per categorical feature showing its value share.
fig, axes = plt.subplots(1, 3, figsize=(15, 8))
for ax, col in zip(axes, ['city', 'animal', 'furniture']):
    counts = df_clean[col].value_counts()
    ax.set_title(df_clean[col].name.title(), fontsize=15)
    ax.pie(counts,
           labels=counts.index,
           textprops={"fontsize": 13},
           autopct='%1.1f%%')
# Counts of the animal policy and furniture status, broken down by city.
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
sns.countplot(x='animal', data=df_clean, hue='city', ax=axes[0])
axes[0].tick_params(axis="x", labelsize=13)
axes[0].set_xlabel('Animal', fontsize=13)
sns.countplot(x='furniture', data=df_clean, hue='city', ax=axes[1])
axes[1].tick_params(axis="x", labelsize=13)
axes[1].set_xlabel('Furniture', fontsize=13)
# Feature/target split, then a 70/30 train-test split with a fixed seed
# for reproducibility.
X = df_clean.drop(columns='rent_amount', axis=1)
y = df_clean['rent_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
def scalerApply(model):
    """
    Fit ``model`` on the training split four times, once with each of four
    scalers/transformations applied to the numeric features:
        MinMaxScaler
        StandardScaler
        RobustScaler
        PowerTransformer
    For every scaler, three diagnostic plots are drawn:
        Actual values x Predicted values (linearity)
        Residual values (actual - predicted)
        Actual distribution x Predicted distribution
    Relies on the module-level X, X_train, X_test, y_train and y_test.
    Returns None; the plots are the output.
    """
    # Column selection does not depend on the scaler, so compute it once.
    # select_dtypes avoids the original unalignable boolean mask built from
    # df_clean.dtypes (df_clean still contains the target column, X does not).
    num_cols = X.select_dtypes(exclude='object').columns
    cat_cols = X.select_dtypes(include='object').columns
    scalers = [MinMaxScaler(), StandardScaler(), RobustScaler(), PowerTransformer()]
    for scaler in scalers:
        # Impute before scaling so the median is computed on raw values.
        num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                          ('scaler', scaler)])
        # drop='first' avoids the dummy-variable trap for linear models.
        cat_transformer = Pipeline(steps=[('one-hot_encoder', OneHotEncoder(drop='first'))])
        preprocessor = ColumnTransformer(transformers=[("num_pipeline", num_transformer, num_cols),
                                                       ("cat_pipeline", cat_transformer, cat_cols)])
        model_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('model', model)])
        model_pipe.fit(X_train, y_train)
        prediction = model_pipe.predict(X_test)
        fig, axes = plt.subplots(1, 3, figsize=(16, 4))
        # seaborn >= 0.12 requires keyword data arguments; distplot was
        # removed in 0.14 (histplot/kdeplot are the replacements).
        sns.scatterplot(x=y_test, y=prediction, ax=axes[0])
        axes[0].set_title('Model Linearity')
        # Residual histogram: centred on 0 for an unbiased model.
        sns.histplot(y_test - prediction, bins=50, kde=True, ax=axes[1])
        axes[1].set_title('Model Residue')
        sns.kdeplot(y_test, color='b', label='Actual', ax=axes[2])
        sns.kdeplot(prediction, color='r', label='Predicted', ax=axes[2])
        axes[2].set_title('Model Accuracy')
        # Title each figure with the scaler's class name.
        fig.suptitle(str(scaler).split(sep='(')[0], fontsize=16, y=1.1)
        plt.tight_layout()
# Compare each regressor under the four scaling strategies.
scalerApply(LinearRegression())
scalerApply(GradientBoostingRegressor())
scalerApply(DecisionTreeRegressor())
scalerApply(RandomForestRegressor())
# Final model: RobustScaler on numeric features + one-hot encoding feeding a
# LinearRegression, evaluated with repeated 10-fold CV on the full data set.
# Impute before scaling so the median is computed on raw values.
num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                  ('scaler', RobustScaler())])
cat_transformer = Pipeline(steps=[('one-hot_encoder', OneHotEncoder(drop='first'))])
# select_dtypes avoids the unalignable boolean mask built from
# df_clean.dtypes (df_clean still contains the target column, X does not).
num_cols = X.select_dtypes(exclude='object').columns
cat_cols = X.select_dtypes(include='object').columns
preprocessor = ColumnTransformer(transformers=[("num_pipeline", num_transformer, num_cols),
                                               ("cat_pipeline", cat_transformer, cat_cols)])
model_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', LinearRegression())])
# 10 folds repeated 3 times -> 30 scores per metric; fixed seed for
# reproducibility.
scoring = ('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error')
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=101)
scores = cross_validate(model_pipe, X, y, scoring=scoring, cv=cv, n_jobs=-1)
# Distribution of the cross-validation scores across the 30 folds.
# (The original had a duplicated assignment: `fig, axes = fig, axes = ...`.)
fig, axes = plt.subplots(1, 3, figsize=(20, 4))
axes[0].hist(scores['test_r2'], bins=20)
axes[0].set_title('R2', fontsize=16)
axes[1].hist(scores['test_neg_mean_absolute_error'], bins=20)
axes[1].set_title('Neg MAE', fontsize=16)
axes[2].hist(scores['test_neg_mean_squared_error'], bins=20)
axes[2].set_title('Neg MSE', fontsize=16)
# Mean scores (MAE/MSE are negated by sklearn, so higher is better).
print('R2: {:.3f}'.format(np.mean(scores['test_r2'])))
print('Neg MAE: {:.3f}'.format(np.mean(scores['test_neg_mean_absolute_error'])))
print('Neg MSE: {:.3f}'.format(np.mean(scores['test_neg_mean_squared_error'])))