In [1]:
import statsmodels.formula.api as smf
import warnings
from IPython.display import HTML
from transforms import transforms, DummyVarsCfg
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Read the raw CSV extract that the rest of the notebook works from.
df = pd.read_csv("../data/w2-saq.csv")

# Run the project's `transforms` step with every warning silenced; the
# suppression is scoped to this `with` block so later cells still see warnings.
# NOTE(review): `transforms` is project-local — presumably it adds the derived
# columns used further down (e.g. `low_income`, `*_name`); confirm in transforms.py.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    df = transforms(
        df,
        dummy_cfg=None,
        drop_temp=False,
    )
In [2]:
# For every (y variable, grouping variable) pair this cell renders and saves:
#   1. a 3-row scatter grid of Tincome vs the y variable, one column per level
#      of the grouping variable and one row per owner segment;
#   2. a pair of heatmaps (one per `low_income` group) of summary statistics
#      of the y variable by owner type and grouping level.

# Full-sample income range; the horizontal "Y Inflection" guide line spans it.
# It does not depend on any loop variable, so compute it once.
mn, mx = df['Tincome'].min(), df['Tincome'].max()

# Summary statistics shown in the heatmaps, as (column name, aggregator) pairs.
aggregates = [
    ('var', 'var'),
    ('mean', 'mean'),
    ('count', 'count'),
    ('p5', lambda x: x.quantile(0.05)),
    ('median', 'median'),
    ('p95', lambda x: x.quantile(0.95))
]

# Each entry: (column to plot on Y, fixed Y guide value or None, log-scale Y?).
# When the guide value is None it is derived per panel from the data.
for y_variable, y_inflect, y_log_scale in [
    ('pricesold', None, True),
    ('intsize', 125, True),
    ('hcost', 0.34, False),
]:
    display(HTML(
        "\n"
        f"<h1>Comparing <font color='blue'>Total Income</font> with <font color='red'>{y_variable}</font></h1>\n"
    ))
    # Full-sample Y range; the vertical guide lines span it in every panel.
    y_min, y_max = df[y_variable].min(), df[y_variable].max()
    y_label = f'{y_variable} (Log Scale)' if y_log_scale else y_variable

    # Each entry: (grouping column, minimum number of subplot columns).
    for g_variable, columns in [
        ('PType_name', 5),
        ('location_name', 5),
        ('bedrooms', 5),
        ('Carspace', 5),
        ('bathroom', 5),
    ]:
        display(HTML(
            "\n"
            "<h2>\n"
            f"By <font color='green'>{g_variable}</font> compare\n"
            "<font color='blue'>Total Income</font> Vs\n"
            f"<font color='red'>{y_variable}</font>\n"
            "</h2>\n"
        ))

        # Materialise the groups once so the grid can be sized to fit them.
        # The configured width is kept as a minimum (unchanged layout for <= 5
        # levels) but widened when there are more levels, which previously
        # raised IndexError when indexing `axes`.
        groups = list(df.groupby(g_variable, observed=True))
        n_cols = max(columns, len(groups))
        fig, axes = plt.subplots(3, n_cols, figsize=(30, 18))

        for col, (n, ss1) in enumerate(groups):
            # One row per owner segment: (label, subset, (ci_l, ci_h)).
            # ci_l / ci_h are the Tincome values at which the two vertical
            # guide lines are drawn; ci_l also splits red from green points.
            # NOTE(review): the 59k/60k/64k thresholds are hard-coded — confirm source.
            for row, (oo, ss, (ci_l, ci_h)) in enumerate([
                ('Occupier & Investor', ss1, (59_000, 60_000)),
                ('Investor', ss1[ss1['owneroc'] == 0], (59_000, 60_000)),
                ('Occupier', ss1[ss1['owneroc'] == 1], (59_000, 64_000)),
            ]):
                ax = axes[row][col]
                ax.set_title(f"{oo} {g_variable}={n}")
                if row == 2:
                    ax.set_xlabel('Total Income (Log Scale)')
                if col == 0:
                    ax.set_ylabel(y_label)

                # Points at or below ci_l vs points above it.
                ssl, ssu = ss[(ss['Tincome'] <= ci_l)], ss[(ss['Tincome'] > ci_l)]

                ax.plot([ci_h, ci_h], [y_min, y_max], color='b', linestyle=(0, (1, 2)), label='X Inflection End')
                ax.plot([ci_l, ci_l], [y_min, y_max], color='r', linestyle=(0, (2, 1)), label='X Inflection Start')

                if y_inflect is None:
                    # No fixed guide value: use the largest Y among rows whose
                    # income is below ci_h (NaN, i.e. no line, if there are none).
                    y_max_stress = ss[ss['Tincome'] < ci_h][y_variable].max()
                    y_inflect_coors = [y_max_stress, y_max_stress]
                else:
                    y_inflect_coors = [y_inflect, y_inflect]
                ax.plot([mn, mx], y_inflect_coors, color='y', linestyle=(0, (2, 1)), label='Y Inflection')

                plot_kwargs = {'alpha': 0.5, 'marker': 'o'}
                ax.scatter(ssl['Tincome'], ssl[y_variable], c='r', **plot_kwargs)
                ax.scatter(ssu['Tincome'], ssu[y_variable], c='g', **plot_kwargs)
                ax.legend(loc='upper right')
                ax.set_xscale('log')
                if y_log_scale:
                    ax.set_yscale('log')

        fig.suptitle(f"Income V {y_variable} by {g_variable}", fontsize=16)
        # The file name must include y_variable: it used to be hard-coded to
        # "hcost", so every y variable overwrote the same PNG.
        fig.savefig(f"./figures/income-v-{y_variable}-by-oo-{g_variable}.png")
        plt.show()
        # Release the figure; this loop creates many large figures.
        plt.close(fig)

        display(HTML(
            "\n"
            "<h2>\n"
            f"<font color='green'>{g_variable}</font> Heatmap of <font color='red'>{y_variable}</font>\n"
            "</h2>\n"
        ))

        # Statistics over all three keys; their per-column min/max give a
        # common normalisation range shared by both heatmaps.
        m_summary = df.groupby(by=['owneroc_name', g_variable, 'low_income'], observed=True)[y_variable].agg(aggregates)
        fig, axes = plt.subplots(1, 2, figsize=(20, 5))
        for i, (low_income, sdf) in enumerate(df.groupby('low_income', observed=True)):
            summary = sdf.groupby(by=['owneroc_name', g_variable], observed=True)[y_variable].agg(aggregates)
            # Colours come from the normalised values; annotations show the raw ones.
            normalized = (summary - m_summary.min()) / (m_summary.max() - m_summary.min())
            # Flatten the 'count' column to the lowest colour value so that
            # sample sizes are annotated but do not drive the colouring.
            normalized['count'] = normalized.min().min()
            sns.heatmap(normalized, annot=summary, cmap='viridis', ax=axes[i])
            axes[i].set_title('low income' if low_income else 'higher income')
        fig.savefig(f"./figures/income-v-{y_variable}-by-oo-{g_variable}-heatmap.png")
        plt.show()
        plt.close(fig)