In [1]:
import statsmodels.formula.api as smf
import warnings
from IPython.display import HTML
from transforms import transforms, DummyVarsCfg
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Read the raw CSV outside the warning filter, so anything pandas reports
# while parsing the file is still shown.
df = pd.read_csv("../data/w2-saq.csv")

# Run the project's transform step with every warning silenced; the previous
# warning filters are restored as soon as the `with` block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    df = transforms(
        df,
        drop_temp=False,
        dummy_cfg=None,
    )
In [2]:
# Rows of every scatter grid, one per owner-occupancy subset:
#   (row label, `owneroc` value to keep or None to keep every row,
#    (x inflection start, x inflection end) on the Tincome axis)
# NOTE(review): the income thresholds are hard-coded magic numbers; where they
# come from is not visible in this notebook -- confirm before reusing them.
_OWNER_PANELS = [
    ('Occupier & Investor', None, (59_000, 60_000)),
    ('Investor', 0, (59_000, 60_000)),
    ('Occupier', 1, (59_000, 64_000)),
]


def _plot_income_scatter_grid(data, y_variable, y_inflect, y_log_scale, g_variable, columns):
    """Draw, save and show a grid of Tincome-vs-`y_variable` scatter plots.

    The grid has one column per group of `g_variable` and one row per entry
    of `_OWNER_PANELS`. Each panel shows the two vertical x-inflection lines,
    one horizontal y-inflection line, and the points split at the inflection
    start (red: Tincome <= start, green: Tincome > start).

    Parameters
    ----------
    data : DataFrame with the columns 'Tincome', 'owneroc', `y_variable`
        and `g_variable`.
    y_variable : name of the column plotted on the y axis.
    y_inflect : y position of the horizontal line; when None, the line is drawn
        at the panel's largest `y_variable` among rows with Tincome below the
        panel's inflection end.
    y_log_scale : whether the y axis uses a log scale.
    g_variable : name of the column whose groups form the grid columns.
    columns : minimum number of grid columns; the grid is widened when the
        data contains more groups than this.

    The figure is written to
    ./figures/income-v-{y_variable}-by-oo-{g_variable}.png and then closed.
    """
    # Axis-wide extents do not depend on the panel, so compute them once.
    x_min, x_max = data['Tincome'].min(), data['Tincome'].max()
    y_min, y_max = data[y_variable].min(), data[y_variable].max()
    y_label = f'{y_variable} (Log Scale)' if y_log_scale else y_variable
    scatter_kwargs = {'alpha': 0.5, 'marker': 'o'}

    groups = list(data.groupby(g_variable, observed=True))
    # Never allocate fewer columns than there are groups, otherwise indexing
    # axes[row][col] would raise IndexError for the extra groups.
    n_cols = max(columns, len(groups), 1)
    last_row = len(_OWNER_PANELS) - 1
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape.
    fig, axes = plt.subplots(len(_OWNER_PANELS), n_cols, figsize=(30, 18), squeeze=False)

    for col, (group_value, group_df) in enumerate(groups):
        for row, (owner_label, owneroc, (ci_l, ci_h)) in enumerate(_OWNER_PANELS):
            ax = axes[row][col]
            ss = group_df if owneroc is None else group_df[group_df['owneroc'] == owneroc]

            ax.set_title(f"{owner_label} {g_variable}={group_value}")
            if row == last_row:
                ax.set_xlabel('Total Income (Log Scale)')
            if col == 0:
                ax.set_ylabel(y_label)

            # Vertical markers for the end and the start of the x inflection band.
            ax.plot([ci_h, ci_h], [y_min, y_max], color='b', linestyle=(0, (1, 2)), label='X Inflection End')
            ax.plot([ci_l, ci_l], [y_min, y_max], color='r', linestyle=(0, (2, 1)), label='X Inflection Start')

            if y_inflect is None:
                y_level = ss[ss['Tincome'] < ci_h][y_variable].max()
            else:
                y_level = y_inflect
            ax.plot([x_min, x_max], [y_level, y_level], color='y', linestyle=(0, (2, 1)), label='Y Inflection')

            # Points at or below the inflection start in red, the rest in green.
            at_or_below = ss[ss['Tincome'] <= ci_l]
            above = ss[ss['Tincome'] > ci_l]
            ax.scatter(at_or_below['Tincome'], at_or_below[y_variable], c='r', **scatter_kwargs)
            ax.scatter(above['Tincome'], above[y_variable], c='g', **scatter_kwargs)

            ax.legend(loc='upper right')
            ax.set_xscale('log')
            if y_log_scale:
                ax.set_yscale('log')

    fig.suptitle(f"Income V {y_variable} by {g_variable}", fontsize=16)
    # The file name carries y_variable so each y variable gets its own file
    # instead of every pass overwriting the same "hcost" image.
    fig.savefig(f"./figures/income-v-{y_variable}-by-oo-{g_variable}.png")
    plt.show()
    # Many figures are produced in one cell; release each one explicitly.
    plt.close(fig)


def _plot_summary_heatmaps(data, y_variable, g_variable):
    """Draw, save and show one summary heatmap per `low_income` group.

    For every `low_income` group, `y_variable` is aggregated (variance, mean,
    count, 5th percentile, median, 95th percentile) by 'owneroc_name' and
    `g_variable`. Cell colours use values min-max scaled against the same
    aggregates computed over all `low_income` groups together, so the panels
    share one scale; cell annotations show the unscaled aggregates.

    Parameters
    ----------
    data : DataFrame with the columns 'owneroc_name', 'low_income',
        `y_variable` and `g_variable`.
    y_variable : name of the column being summarised.
    g_variable : name of the column used for the second grouping level.

    The figure is written to
    ./figures/income-v-{y_variable}-by-oo-{g_variable}-heatmap.png and then
    closed.
    """
    aggregates = [
        ('var', 'var'),
        ('mean', 'mean'),
        ('count', 'count'),
        ('p5', lambda x: x.quantile(0.05)),
        ('median', 'median'),
        ('p95', lambda x: x.quantile(0.95)),
    ]
    m_summary = data.groupby(
        by=['owneroc_name', g_variable, 'low_income'], observed=True
    )[y_variable].agg(aggregates)
    # Per-aggregate range over all groups; identical for every panel, so
    # compute it once instead of once per panel.
    lowest, highest = m_summary.min(), m_summary.max()

    # NOTE(review): two panels are allocated, which assumes `low_income` has at
    # most two distinct values -- confirm against the transform step.
    fig, axes = plt.subplots(1, 2, figsize=(20, 5))
    for i, (low_income, sdf) in enumerate(data.groupby('low_income', observed=True)):
        summary = sdf.groupby(by=['owneroc_name', g_variable], observed=True)[y_variable].agg(aggregates)
        normalized = (summary - lowest) / (highest - lowest)
        # Pin the count column to the smallest scaled value in the table --
        # presumably so raw counts do not drive the colour scale (the counts
        # are still visible through the annotations).
        normalized['count'] = normalized.min().min()
        sns.heatmap(normalized, annot=summary, cmap='viridis', ax=axes[i])
        axes[i].set_title('low income' if low_income else 'higher income')
    fig.savefig(f"./figures/income-v-{y_variable}-by-oo-{g_variable}-heatmap.png")
    plt.show()
    plt.close(fig)


# Sweep every (y variable, grouping variable) pair: a heading, the scatter
# grid, a second heading, then the summary heatmaps.
for y_variable, y_inflect, y_log_scale in [
    ('pricesold', None, True),
    ('intsize', 125, True),
    ('hcost', 0.34, False),
]:
    display(HTML(f"""
    <h1>Comparing <font color='blue'>Total Income</font> with <font color='red'>{y_variable}</font></h1>
    """))
    for g_variable, columns in [
        ('PType_name', 5),
        ('location_name', 5),
        ('bedrooms', 5),
        ('Carspace', 5),
        ('bathroom', 5),
    ]:
        display(HTML(f"""
        <h2>
        By <font color='green'>{g_variable}</font> compare
        <font color='blue'>Total Income</font> Vs
        <font color='red'>{y_variable}</font>
        </h2>
        """))
        _plot_income_scatter_grid(df, y_variable, y_inflect, y_log_scale, g_variable, columns)

        display(HTML(f"""
        <h2>
        <font color='green'>{g_variable}</font> Heatmap of <font color='red'>{y_variable}</font>
        </h2>
        """))
        _plot_summary_heatmaps(df, y_variable, g_variable)
Comparing Total Income with pricesold
By PType_name, compare Total Income Vs pricesold
PType_name Heatmap, of pricesold
By location_name, compare Total Income Vs pricesold
location_name Heatmap, of pricesold
By bedrooms, compare Total Income Vs pricesold
bedrooms Heatmap, of pricesold
By Carspace, compare Total Income Vs pricesold
Carspace Heatmap, of pricesold
By bathroom, compare Total Income Vs pricesold
bathroom Heatmap, of pricesold
Comparing Total Income with intsize
By PType_name, compare Total Income Vs intsize
PType_name Heatmap, of intsize
By location_name, compare Total Income Vs intsize
location_name Heatmap, of intsize
By bedrooms, compare Total Income Vs intsize
bedrooms Heatmap, of intsize
By Carspace, compare Total Income Vs intsize
Carspace Heatmap, of intsize
By bathroom, compare Total Income Vs intsize
bathroom Heatmap, of intsize
Comparing Total Income with hcost
By PType_name, compare Total Income Vs hcost
PType_name Heatmap, of hcost
By location_name, compare Total Income Vs hcost
location_name Heatmap, of hcost
By bedrooms, compare Total Income Vs hcost
bedrooms Heatmap, of hcost
By Carspace, compare Total Income Vs hcost
Carspace Heatmap, of hcost
By bathroom, compare Total Income Vs hcost
bathroom Heatmap, of hcost
In [ ]: