import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path

# Output directory (the folder holding this script) and the WHO input files
BASE = Path(__file__).resolve().parent
smoking_csv = Path('/mnt/data/smoking_prevalence_who_75DDA77_ALL_LATEST.csv')
life_csv = Path('/mnt/data/life_expectancy_who_90E2E48_ALL_LATEST.csv')

# Read both WHO datasets (smoking first, then life expectancy)
s, le = (pd.read_csv(csv_path) for csv_path in (smoking_csv, life_csv))

# Parse the year column as numbers; values that cannot be parsed become NaN
for frame in (s, le):
    frame["DIM_TIME"] = pd.to_numeric(frame["DIM_TIME"], errors="coerce")

# Restrict each dataset to rows whose sex is TOTAL and whose geo type is COUNTRY
s = s.loc[s['DIM_SEX'].eq('TOTAL') & s['DIM_GEO_CODE_TYPE'].eq('COUNTRY')]
le = le.loc[le['DIM_SEX'].eq('TOTAL') & le['DIM_GEO_CODE_TYPE'].eq('COUNTRY')]

# Smoking prevalence: prefer 2025 (modelled/predicted as per WHO metadata).
# When the data has no 2025 rows, use the most recent year available instead.
has_2025 = (s['DIM_TIME'] == 2025).any()
smoking_year = 2025 if has_2025 else int(s['DIM_TIME'].max())
s_y = s.loc[s['DIM_TIME'] == smoking_year].copy()

# Parse prevalence as a number; entries that cannot be parsed become NaN
s_y['smoking_prevalence_pct'] = pd.to_numeric(s_y['PERCENT_POP_N'], errors='coerce')

# Rank countries by prevalence, highest first; rows without a value are left out
ranked = s_y[['DIM_GEO_CODE_M49', 'GEO_NAME_SHORT', 'smoking_prevalence_pct']]
ranked = ranked.dropna(subset=['smoking_prevalence_pct'])
ranked = ranked.sort_values('smoking_prevalence_pct', ascending=False)
ranked = ranked.reset_index(drop=True)
ranked['rank'] = np.arange(len(ranked)) + 1

# Top 100 (or every row when fewer exist), with presentation column names
top100_out = (
    ranked.head(100)[['rank', 'GEO_NAME_SHORT', 'smoking_prevalence_pct']]
    .rename(columns={
        'GEO_NAME_SHORT': 'country',
        'smoking_prevalence_pct': 'adult_smoking_prevalence_pct',
    })
)

# Top 20 (independent copy of the first 20 rows)
Top20 = top100_out.head(20).copy()

# Write both tables as CSV into the output directory
for table, csv_name in (
    (top100_out, 'table_top100_smoking_prevalence_2025.csv'),
    (Top20, 'table_top20_smoking_prevalence_2025.csv'),
):
    table.to_csv(BASE / csv_name, index=False)

# Save HTML tables (lightweight, no CSS)
def df_to_min_html(df: pd.DataFrame, title: str) -> str:
    """Render *df* as a minimal, unstyled HTML table under an ``<h3>`` heading.

    Args:
        df: Table to render. It is copied, never modified. If it has a column
            named ``adult_smoking_prevalence_pct``, that column's values are
            shown with exactly one decimal place.
        title: Text for the ``<h3>`` heading placed above the table.

    Returns:
        The heading and table markup as a single newline-joined string.
        The title, column names and cell values are HTML-escaped.
    """
    # Imported here because the module does not import ``html`` at file level.
    from html import escape

    def cell(value: object) -> str:
        # Escape &, < and > so text taken from the data cannot break the markup.
        return escape(str(value), quote=False)

    # Ensure consistent rounding
    df2 = df.copy()
    if 'adult_smoking_prevalence_pct' in df2.columns:
        df2['adult_smoking_prevalence_pct'] = df2['adult_smoking_prevalence_pct'].map(lambda x: f"{x:.1f}")

    parts = [f"<h3>{cell(title)}</h3>", '<table border="1" cellspacing="0" cellpadding="6">']
    parts.append('<thead><tr>' + ''.join(f"<th>{cell(c)}</th>" for c in df2.columns) + '</tr></thead>')
    parts.append('<tbody>')
    # itertuples keeps each column's own dtype. iterrows builds one Series per
    # row, which turns integers into floats ("1" -> "1.0") when every column
    # is numeric.
    for values in df2.itertuples(index=False, name=None):
        parts.append('<tr>' + ''.join(f"<td>{cell(v)}</td>" for v in values) + '</tr>')
    parts.append('</tbody></table>')
    return '\n'.join(parts)

# Render each table to HTML and write it into the output directory
for html_name, table, heading in (
    (
        'table_top100_smoking_prevalence_2025.html',
        top100_out,
        f"Top 100 Countries by Adult Smoking Prevalence (%), {smoking_year}",
    ),
    (
        'table_top20_smoking_prevalence_2025.html',
        Top20,
        f"Top 20 (for chart), {smoking_year}",
    ),
):
    (BASE / html_name).write_text(df_to_min_html(table, heading), encoding='utf-8')

# Chart 1: Top 20 bar (horizontal for readability)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
# barh draws the first row at the bottom, so plot the ranked table in reverse
# to put #1 on top. Top20 is already ordered by rank; reversing it (instead of
# re-sorting by value) keeps countries with equal prevalence in the same
# relative order as the rank table.
Top20_plot = Top20.iloc[::-1]
ax.barh(Top20_plot['country'], Top20_plot['adult_smoking_prevalence_pct'])
ax.set_xlabel('Adult smoking prevalence (%)')
ax.set_ylabel('Country')
ax.set_title(f'Top 20 Countries by Adult Smoking Prevalence (%), {smoking_year}')
# Light vertical guide lines make bar lengths easier to read
ax.grid(True, axis='x', linestyle='--', linewidth=0.5, alpha=0.6)
fig.tight_layout()
fig.savefig(BASE / 'chart_top20_smoking_prevalence_2025.png', dpi=200)
# Close the figure so it is released before the next chart is built
plt.close(fig)

# Life expectancy year: use 2019 when the data has it, otherwise the most
# recent year present
if (le['DIM_TIME'] == 2019).any():
    life_year = 2019
else:
    life_year = int(le['DIM_TIME'].max())
life_rows = le.loc[le['DIM_TIME'] == life_year].copy()
life_rows['life_expectancy_years'] = pd.to_numeric(life_rows['AMOUNT_N'], errors='coerce')

# Pair smoking and life expectancy rows by M49 code (DIM_GEO_CODE_M49).
# The inner join plus dropna keeps only countries that have both values.
paired = pd.merge(
    s_y[['DIM_GEO_CODE_M49', 'GEO_NAME_SHORT', 'smoking_prevalence_pct']],
    life_rows[['DIM_GEO_CODE_M49', 'life_expectancy_years']],
    on='DIM_GEO_CODE_M49',
    how='inner',
)
paired = paired.dropna(subset=['smoking_prevalence_pct', 'life_expectancy_years'])
paired = paired.rename(columns={'GEO_NAME_SHORT': 'country'})

# Export the matched dataset
merged_out = paired[['country', 'smoking_prevalence_pct', 'life_expectancy_years']].copy()
merged_out.to_csv(BASE / 'dataset_smoking_vs_life_expectancy.csv', index=False)

# Pearson correlation and straight-line fit of life expectancy on smoking.
# With fewer than 3 matched countries all three statistics stay NaN.
x = merged_out['smoking_prevalence_pct'].to_numpy()
y = merged_out['life_expectancy_years'].to_numpy()
r = a = b = float('nan')
if len(x) >= 3:
    r = float(np.corrcoef(x, y)[0, 1])
    # Degree-1 polyfit returns the slope first, then the intercept (y = a + b*x)
    b, a = np.polyfit(x, y, 1)

# Save stats: one key=value line each, with NA for non-finite statistics
stats_lines = [
    f"smoking_year={smoking_year}",
    f"life_expectancy_year={life_year}",
    f"n_countries_merged={len(merged_out)}",
]
for label, value, spec in (
    ('pearson_r', r, '.4f'),
    ('regression_intercept_a', a, '.6f'),
    ('regression_slope_b', b, '.6f'),
):
    if np.isfinite(value):
        stats_lines.append(f"{label}={value:{spec}}")
    else:
        stats_lines.append(f"{label}=NA")
stats_lines.append("model: life_expectancy_years = a + b * smoking_prevalence_pct")
(BASE / 'stats_smoking_vs_life_expectancy.txt').write_text(
    '\n'.join(stats_lines),
    encoding='utf-8',
)

# Chart 2: scatter of the matched countries, plus the fitted line when one exists
fig, ax = plt.subplots(figsize=(9.5, 7))
ax.scatter(merged_out['smoking_prevalence_pct'], merged_out['life_expectancy_years'], s=18)
ax.set(
    xlabel='Adult smoking prevalence (%)',
    ylabel='Life expectancy at birth (years)',
    title=f'Smoking prevalence ({smoking_year}) vs life expectancy ({life_year})',
)
ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)

# Draw the regression line and the r label only when both coefficients are finite
fit_available = np.isfinite(a) and np.isfinite(b)
if fit_available:
    # 200 evenly spaced points spanning the observed smoking prevalence range
    xs_fit = np.linspace(float(np.nanmin(x)), float(np.nanmax(x)), 200)
    ax.plot(xs_fit, a + b * xs_fit, linewidth=1.5)
    # transAxes places the label in axes coordinates (near the lower-left corner)
    ax.text(0.02, 0.02, f"r = {r:.2f}", transform=ax.transAxes)

fig.tight_layout()
fig.savefig(BASE / 'chart_smoking_vs_life_expectancy.png', dpi=200)
plt.close(fig)

# README + sources: build both text bodies, then write them into the output directory
readme_text = f"""Smoking prevalence 2025 — asset pack

Contents
- table_top100_smoking_prevalence_2025.csv / .html
- table_top20_smoking_prevalence_2025.csv / .html
- chart_top20_smoking_prevalence_2025.png
- dataset_smoking_vs_life_expectancy.csv
- chart_smoking_vs_life_expectancy.png
- stats_smoking_vs_life_expectancy.txt

Notes
- Smoking prevalence: WHO Data Hub indicator 75DDA77 (code M_Est_tob_curr_std), TOTAL, country-level. Year used: {smoking_year}.
- Life expectancy: WHO Data Hub indicator 90E2E48 (code WHOSIS_000001), TOTAL, country-level. Year used: {life_year}.
- The scatter uses matched countries present in BOTH datasets.

"""

sources_text = """Primary sources (WHO)
1) Smoking prevalence indicator page (WHO Data Hub): https://data.who.int/indicators/i/847662C/75DDA77
2) Smoking prevalence dataset CSV (WHO Data Hub): https://srhdpeuwpubsa.blob.core.windows.net/whdh/DATADOT/INDICATOR/75DDA77_ALL_LATEST.csv
3) Life expectancy indicator page (WHO Data Hub): https://data.who.int/indicators/i/A21CFC2/90E2E48
4) Life expectancy dataset CSV (WHO Data Hub): https://srhdpeuwpubsa.blob.core.windows.net/whdh/DATADOT/INDICATOR/90E2E48_ALL_LATEST.csv

"""

for text_name, body in (
    ('README.txt', readme_text),
    ('SOURCES.txt', sources_text),
):
    (BASE / text_name).write_text(body, encoding='utf-8')

# Signal that every output was written
print('OK')
