"""
Air fares before and after 9/11
=====================================

This is a business-intelligence (BI) like application.

What is interesting here is that we may want to study fares as a function
of the year, paired accordingly to the trips, or forgetting the year,
only as a function of the trip endpoints.

Using statsmodels' linear models, we find that both with an OLS (ordinary
least square) and a robust fit, the intercept and the slope are
significantly non-zero: the air fares have decreased between 2000 and
2001, and their dependence on distance travelled has also decreased

"""

# Standard library imports
import os

##############################################################################
# Load the data
import pandas
import requests

if not os.path.exists("airfares.txt"):
    # Download the file if it is not present
    r = requests.get(
        "https://users.stat.ufl.edu/~winner/data/airq4.dat",
        verify=False,  # Wouldn't normally do this, but this site's certificate
        # is not yet distributed
    )
    with open("airfares.txt", "wb") as f:
        f.write(r.content)

# As a separator, ' +' is a regular expression that means 'one of more
# space'
data = pandas.read_csv(
    "airfares.txt",
    delim_whitespace=True,
    header=0,
    names=[
        "city1",
        "city2",
        "pop1",
        "pop2",
        "dist",
        "fare_2000",
        "nb_passengers_2000",
        "fare_2001",
        "nb_passengers_2001",
    ],
)

# we log-transform the number of passengers
import numpy as np

data["nb_passengers_2000"] = np.log10(data["nb_passengers_2000"])
data["nb_passengers_2001"] = np.log10(data["nb_passengers_2001"])

##############################################################################
# Make a dataframe with the year as an attribute, instead of separate columns

# This involves a small danse in which we separate the dataframes in 2,
# one for year 2000, and one for 2001, before concatenating again.

# Make an index of each flight
data_flat = data.reset_index()

data_2000 = data_flat[
    ["city1", "city2", "pop1", "pop2", "dist", "fare_2000", "nb_passengers_2000"]
]
# Rename the columns
data_2000.columns = pandas.Index(
    ["city1", "city2", "pop1", "pop2", "dist", "fare", "nb_passengers"]
)
# Add a column with the year
data_2000.insert(0, "year", 2000)

data_2001 = data_flat[
    ["city1", "city2", "pop1", "pop2", "dist", "fare_2001", "nb_passengers_2001"]
]
# Rename the columns
data_2001.columns = pandas.Index(
    ["city1", "city2", "pop1", "pop2", "dist", "fare", "nb_passengers"]
)
# Add a column with the year
data_2001.insert(0, "year", 2001)

data_flat = pandas.concat([data_2000, data_2001])


##############################################################################
# Plot scatter matrices highlighting different aspects

import seaborn

seaborn.pairplot(
    data_flat, vars=["fare", "dist", "nb_passengers"], kind="reg", markers="."
)

# A second plot, to show the effect of the year (ie the 9/11 effect)
seaborn.pairplot(
    data_flat,
    vars=["fare", "dist", "nb_passengers"],
    kind="reg",
    hue="year",
    markers=".",
)


##############################################################################
# Plot the difference in fare

import matplotlib.pyplot as plt

plt.figure(figsize=(5, 2))
seaborn.boxplot(data.fare_2001 - data.fare_2000)
plt.title("Fare: 2001 - 2000")
plt.subplots_adjust()

plt.figure(figsize=(5, 2))
seaborn.boxplot(data.nb_passengers_2001 - data.nb_passengers_2000)
plt.title("NB passengers: 2001 - 2000")
plt.subplots_adjust()


##############################################################################
# Statistical testing: dependence of fare on distance and number of
# passengers
import statsmodels.formula.api as sm

result = sm.ols(formula="fare ~ 1 + dist + nb_passengers", data=data_flat).fit()
print(result.summary())

# Using a robust fit
result = sm.rlm(formula="fare ~ 1 + dist + nb_passengers", data=data_flat).fit()
print(result.summary())


##############################################################################
# Statistical testing: regression of fare on distance: 2001/2000 difference

result = sm.ols(formula="fare_2001 - fare_2000 ~ 1 + dist", data=data).fit()
print(result.summary())

# Plot the corresponding regression
data["fare_difference"] = data["fare_2001"] - data["fare_2000"]
seaborn.lmplot(x="dist", y="fare_difference", data=data)

plt.show()
