"""
Relating Gender and IQ
=======================

Going back to the brain size + IQ data, test if the VIQ of male and
female are different after removing the effect of brain size, height and
weight.

Notice that here 'Gender' is a categorical value. As it is a non-float
data type, statsmodels is able to automatically infer this.

"""

import pandas
from statsmodels.formula.api import ols

data = pandas.read_csv("../brain_size.csv", sep=";", na_values=".")

model = ols("VIQ ~ Gender + MRI_Count + Height", data).fit()
print(model.summary())

# Here, we don't need to define a contrast, as we are testing a single
# coefficient of our model, and not a combination of coefficients.
# However, defining a contrast, which would then be a 'unit contrast',
# will give us the same results
print(model.f_test([0, 1, 0, 0]))

###############################################################################
# Here we plot a scatter matrix to get intuitions on our results.
# This goes beyond what was asked in the exercise

# This plotting is useful to get an intuitions on the relationships between
# our different variables

from pandas import plotting
import matplotlib.pyplot as plt

# Fill in the missing values for Height for plotting
data["Height"] = data["Height"].ffill()

# The parameter 'c' is passed to plt.scatter and will control the color
# The same holds for parameters 'marker', 'alpha' and 'cmap', that
# control respectively the type of marker used, their transparency and
# the colormap
plotting.scatter_matrix(
    data[["VIQ", "MRI_Count", "Height"]],
    c=(data["Gender"] == "Female"),
    marker="o",
    alpha=1,
    cmap="winter",
)

fig = plt.gcf()
fig.suptitle("blue: male, green: female", size=13)

plt.show()
