import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt

# ---------------- Paths --------------------------------------------------------
OUT = "out"
os.makedirs(OUT, exist_ok=True)

# ---------------- Data ---------------------------------------------------------
years = np.array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021, 2023, 2024])
# Pew Research Center Mobile Fact Sheet (accessed 2025-06-26)
adoption = np.array([35, 45, 55, 64, 68, 72, 77, 81, 81, 85, 88, 90])

train_mask = years <= 2019
test_mask = ~train_mask

# ---------------- Richards model (Modules 2 + 5) ------------------------------
def richards(t, L, k, t0, nu):
    """Richards (generalised logistic) curve."""
    return L / (1 + np.exp(-k * (t - t0))) ** (1 / nu)

p0 = [100, 0.4, 2014, 0.8]
bounds = ([80, 0, 2010, 0.2], [120, 5, 2020, 5])

params, cov = opt.curve_fit(
    richards, years[train_mask], adoption[train_mask],
    p0=p0, bounds=bounds, maxfev=20000
)
se = np.sqrt(np.diag(cov))

param_names = ["L", "k", "t0", "nu"]
df_params = pd.DataFrame({"Parameter": param_names, "Estimate": params, "StdError": se})
df_params.to_csv(f"{OUT}/richards_params.csv", index=False)

# correlation matrix
corr = cov / np.outer(se, se)
df_corr = pd.DataFrame(corr, index=param_names, columns=param_names)
df_corr.to_csv(f"{OUT}/param_corr.csv")

# ---------------- Rolling-origin CV -------------------------------------------
records = []
for T in range(2015, 2020):
    mask_tr = years < T
    mask_te = years == T
    popt, _ = opt.curve_fit(
        richards, years[mask_tr], adoption[mask_tr],
        p0=p0, bounds=bounds, maxfev=10000
    )
    pred = richards(years[mask_te], *popt)[0]
    obs = adoption[mask_te][0]
    records.append({"TrainEnd": T - 1, "PredYear": T, "Pred": pred,
                    "Obs": obs, "AbsErr": abs(pred - obs)})

df_cv = pd.DataFrame(records)
rmse = np.sqrt(((df_cv["Pred"] - df_cv["Obs"]) ** 2).mean())
df_cv.loc[len(df_cv)] = ["—", "RMSE", "", "", rmse]
df_cv.to_csv(f"{OUT}/rolling_cv.csv", index=False)

# ---------------- Plot deterministic fit --------------------------------------
t_plot = np.linspace(years.min(), years.max(), 400)

plt.figure(figsize=(7, 4))
plt.scatter(years, adoption, marker="x", label="Data")
plt.plot(t_plot, richards(t_plot, *params), label="Richards fit", linewidth=2)
plt.axvline(2019.5, color="grey", linestyle=":")
plt.xlabel("Year")
plt.ylabel("Smartphone ownership (%)")
plt.title("Deterministic Richards fit")
plt.legend()
plt.tight_layout()
plt.savefig(f"{OUT}/richards_fit.png", dpi=300)
plt.close()

# ---------------- Stochastic Module-8 envelope --------------------------------
L_, k_, t0_, nu_ = params

residuals = adoption[train_mask] - richards(years[train_mask], *params)
denom = 2 * adoption[train_mask] * (1 - adoption[train_mask] / L_)
D_hat = max(np.mean((residuals ** 2) / denom), 1e-6)

n_sims = 20000
years_full = np.arange(years.min(), years.max() + 1)
dt = 1.0
idx_start = np.where(years_full == 2019)[0][0]

sim = np.zeros((n_sims, len(years_full)))
sim[:, :idx_start + 1] = richards(years_full[:idx_start + 1], *params)

for i in range(idx_start + 1, len(years_full)):
    x_prev = sim[:, i - 1]
    drift = k_ * (1 - x_prev / L_) * x_prev
    sigma = np.sqrt(2 * D_hat * x_prev * (1 - x_prev / L_))
    sim[:, i] = np.clip(
        x_prev + drift * dt + sigma * np.random.normal(size=n_sims) * np.sqrt(dt),
        0, L_
    )

lower = np.percentile(sim, 2.5, axis=0)
upper = np.percentile(sim, 97.5, axis=0)
mean = sim.mean(axis=0)

plt.figure(figsize=(7, 4))
plt.fill_between(years_full, lower, upper, color="orange", alpha=0.3, label="95% band")
plt.plot(years_full, mean, color="orange", label="Mean")
plt.scatter(years, adoption, color="black", marker="x", label="Data", zorder=5)
plt.axvline(2019.5, color="grey", linestyle=":")
plt.xlabel("Year")
plt.ylabel("Smartphone ownership (%)")
plt.title("Module-8 stochastic envelope")
plt.legend()
plt.tight_layout()
plt.savefig(f"{OUT}/stochastic_band.png", dpi=300)
plt.close()

print("Analysis complete. Outputs in 'out/'")
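
# ---------------- Optional holdout check (illustrative addition) --------------
# A minimal sketch of how the otherwise unused `test_mask` could be exercised:
# compare the deterministic Richards extrapolation and the simulated 95% band
# against the held-out 2021-2024 observations. The output file name
# `holdout_check.csv` is an assumption, not part of the original pipeline.
holdout_years = years[test_mask]
holdout_obs = adoption[test_mask]
holdout_pred = richards(holdout_years, *params)

# Locate the holdout years inside the simulated grid (years_full is sorted).
idx_holdout = np.searchsorted(years_full, holdout_years)

df_holdout = pd.DataFrame({
    "Year": holdout_years,
    "Obs": holdout_obs,
    "DeterministicPred": holdout_pred,
    "AbsErr": np.abs(holdout_pred - holdout_obs),
    "Band_2.5": lower[idx_holdout],
    "Band_97.5": upper[idx_holdout],
    "InBand": (holdout_obs >= lower[idx_holdout]) & (holdout_obs <= upper[idx_holdout]),
})
df_holdout.to_csv(f"{OUT}/holdout_check.csv", index=False)
print("Holdout check written to 'out/holdout_check.csv'")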