# Core library imports
import sys
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session. This keeps the
# notebook output clean, but it also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

# Add parent directory to path for module imports.
# The parent is resolved from the current working directory (not from this
# file's location), so it depends on where the session was started.
# Presumably this is where `data_loader` and `utils` live - TODO confirm.
module_path = Path.cwd().parent
if str(module_path) not in sys.path:
    # Prepend (index 0) so this directory is searched before the existing entries
    sys.path.insert(0, str(module_path))

# Data manipulation
import numpy as np
import pandas as pd

# Statistical analysis
from scipy import stats
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.stats.diagnostic import acorr_ljungbox, het_arch
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# MS-GARCH module
from data_loader import DataLoader
from utils import (
    compute_log_returns,
    validate_stationarity,
    test_heteroskedasticity,
    test_autocorrelation,
    test_normality,
    compute_realized_volatility,
    qq_plot_statistics,
)

# Plot appearance: seaborn grid theme plus default figure size and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

# Fix NumPy's global RNG seed so any random draws are repeatable
np.random.seed(42)

print("✓ Environment setup complete")

✓ Environment setup complete

# Asset universe shared by the loader call and the summary loop below.
# Defining it once keeps the two from drifting apart when assets are added.
assets = ["BTC", "ETH", "SOL"]

# Initialize data loader
loader = DataLoader(
    config_path="../configs/ms_garch_config.yaml",
    verbose=True
)

# Load multi-asset data
data = loader.load_multi_asset(
    assets=assets,
    start_date="2022-01-01",  # Focus on recent crypto market cycles
    end_date=None,  # Use all available data
    align_timestamps=True,
    validate=True
)

# Per-asset summary: covered period, sample size and first two return moments
print("\n" + "="*60)
print("DATA LOADING SUMMARY")
print("="*60)
for asset in assets:
    asset_data = data[asset]
    asset_returns = asset_data['returns']
    print(f"\n{asset}:")
    print(f"  Period: {asset_data['start_date']} to {asset_data['end_date']}")
    print(f"  Observations: {asset_data['n_observations']:,}")
    print(f"  Mean return: {asset_returns.mean():.6f}")
    print(f"  Volatility (std): {asset_returns.std():.6f}")

============================================================
Loading data for 3 assets: BTC, ETH, SOL
============================================================

Loading BTC from: BTCUSDT_BYBIT_4h_2022-01-01_2025-07-31.parquet

  Statistical Validation for BTC:
  --------------------------------------------------
  1. Stationarity (ADF): statistic=-19.9030, p-value=0.0000 ✓ STATIONARY
  2. ARCH Effects: LM-statistic=367.2011, p-value=0.0000 ✓ ARCH EFFECTS PRESENT
  3. Autocorrelation (Ljung-Box): statistic=66.3702, p-value=0.0000
  4. Normality (Jarque-Bera): statistic=17342.9232, p-value=0.0000 ✗ NON-NORMAL (expected for crypto)
  5. Distribution: skew=-0.098, excess_kurtosis=7.289 ✓ FAT TAILS
  --------------------------------------------------

  Loaded 7842 observations from 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Return statistics: mean=0.000118, std=0.011020, skew=-0.098, kurt=7.289
Loading ETH from: ETHUSDT_BYBIT_4h_2022-01-01_2025-07-31.parquet

  Statistical Validation for ETH:
  --------------------------------------------------
  1. Stationarity (ADF): statistic=-18.0997, p-value=0.0000 ✓ STATIONARY
  2. ARCH Effects: LM-statistic=454.5257, p-value=0.0000 ✓ ARCH EFFECTS PRESENT
  3. Autocorrelation (Ljung-Box): statistic=63.8320, p-value=0.0000
  4. Normality (Jarque-Bera): statistic=25098.3486, p-value=0.0000 ✗ NON-NORMAL (expected for crypto)
  5. Distribution: skew=-0.347, excess_kurtosis=8.744 ✓ FAT TAILS
  --------------------------------------------------

  Loaded 7842 observations from 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Return statistics: mean=0.000003, std=0.014614, skew=-0.347, kurt=8.744
Loading SOL from: SOLUSDT_BYBIT_4h_2022-01-01_2025-07-31.parquet
  WARNING: 4 potential outliers detected in SOL (returns > 20.0%)
  First outlier dates: [Timestamp('2022-11-09 12:00:00'), Timestamp('2022-11-10 00:00:00'), Timestamp('2022-11-10 12:00:00'), Timestamp('2023-01-14 00:00:00')]

  Statistical Validation for SOL:
  --------------------------------------------------
  1. Stationarity (ADF): statistic=-37.0029, p-value=0.0000 ✓ STATIONARY
  2. ARCH Effects: LM-statistic=1802.3912, p-value=0.0000 ✓ ARCH EFFECTS PRESENT
  3. Autocorrelation (Ljung-Box): statistic=58.8047, p-value=0.0000
  4. Normality (Jarque-Bera): statistic=54958.0595, p-value=0.0000 ✗ NON-NORMAL (expected for crypto)
  5. Distribution: skew=-0.214, excess_kurtosis=12.972 ✓ FAT TAILS
  --------------------------------------------------

  Loaded 7842 observations from 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Return statistics: mean=0.000003, std=0.021843, skew=-0.214, kurt=12.972

============================================================
Aligning timestamps across assets...
============================================================

  Original lengths: {'BTC': 7841, 'ETH': 7841, 'SOL': 7841}
  Aligned length: 7841
  Lost observations: {'BTC': 0, 'ETH': 0, 'SOL': 0}

Cross-Asset Correlation Matrix:
       BTC    ETH    SOL
BTC  1.000  0.841  0.727
ETH  0.841  1.000  0.733
SOL  0.727  0.733  1.000

Average correlation: 0.767

============================================================
DATA LOADING SUMMARY
============================================================

BTC:
  Period: 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Observations: 7,841
  Mean return: 0.000118
  Volatility (std): 0.011020

ETH:
  Period: 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Observations: 7,841
  Mean return: 0.000003
  Volatility (std): 0.014614

SOL:
  Period: 2022-01-01 00:00:00 to 2025-07-30 20:00:00
  Observations: 7,841
  Mean return: 0.000003
  Volatility (std): 0.021843

# Interactive price chart: one Plotly panel per asset, stacked on a shared time axis
assets = ["BTC", "ETH", "SOL"]
colors = ['#F7931A', '#627EEA', '#00D4AA']  # BTC orange, ETH purple, SOL teal

fig = make_subplots(
    rows=3,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.08,
    subplot_titles=('BTC Price', 'ETH Price', 'SOL Price'),
)

# Plotly subplot rows are 1-based, hence start=1
for row, (asset, color) in enumerate(zip(assets, colors), start=1):
    prices = data[asset]['prices']
    price_trace = go.Scatter(
        x=prices.index,
        y=prices.values,
        name=asset,
        line={'color': color, 'width': 1.5},
    )
    fig.add_trace(price_trace, row=row, col=1)
    fig.update_yaxes(title_text=f"{asset} Price (USD)", row=row, col=1)

fig.update_layout(
    title_text="Cryptocurrency Prices (4H Timeframe)",
    height=900,
    hovermode='x unified',
    showlegend=False,
)

fig.show()

# Reading guide printed below the chart
for line in (
    "\n📊 Key Observations:",
    "- Identify major bull/bear market transitions",
    "- Note periods of high volatility (FTX collapse, banking crisis, etc.)",
    "- Observe cross-asset co-movements",
):
    print(line)

📊 Key Observations:
- Identify major bull/bear market transitions
- Note periods of high volatility (FTX collapse, banking crisis, etc.)
- Observe cross-asset co-movements

# Create returns visualization: one stacked panel per asset on a shared time axis
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

for ax, asset, color in zip(axes, assets, colors):
    returns = data[asset]['returns']
    ax.plot(
        returns.index,
        returns.values,
        color=color,
        linewidth=0.8,
        alpha=0.7
    )
    ax.set_ylabel(f'{asset} Log Return')
    # The series is sampled on 4-hour bars, so the title must not call it "daily"
    ax.set_title(f'{asset} Log Returns (4H Timeframe)')
    # Zero line makes the sign of each return easy to read
    ax.axhline(y=0, color='black', linestyle='--', linewidth=0.8, alpha=0.5)
    ax.grid(True, alpha=0.3)

# Only the bottom panel needs an x label because the axis is shared
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

print("\n📊 Stylized Facts to Observe:")
print("1. Volatility Clustering: Large returns followed by large returns")
print("2. Asymmetry: Different response to positive vs negative shocks")
print("3. Fat Tails: Extreme events more frequent than normal distribution")

📊 Stylized Facts to Observe:
1. Volatility Clustering: Large returns followed by large returns
2. Asymmetry: Different response to positive vs negative shocks
3. Fat Tails: Extreme events more frequent than normal distribution

# Distribution statistics: moments, range and empirical lower-tail quantiles per asset
print("="*70)
print("RETURN DISTRIBUTION STATISTICS")
print("="*70)

dist_stats = []
for asset in assets:
    returns = data[asset]['returns']

    # pandas reports Fisher (excess) kurtosis, i.e. a normal distribution gives 0
    excess_kurt = returns.kurtosis()

    stats_dict = {
        'Asset': asset,
        'Mean': returns.mean(),
        'Std Dev': returns.std(),
        'Skewness': returns.skew(),
        # Raw (Pearson) kurtosis, normal == 3; previously this column just
        # repeated the excess value shown in the next column
        'Kurtosis': excess_kurt + 3,
        'Excess Kurt': excess_kurt,
        'Min': returns.min(),
        'Max': returns.max(),
        # Empirical lower-tail quantiles of the return distribution
        'VaR (95%)': returns.quantile(0.05),
        'VaR (99%)': returns.quantile(0.01),
    }
    dist_stats.append(stats_dict)

dist_df = pd.DataFrame(dist_stats)
print(dist_df.to_string(index=False))

print("\n📊 Interpretation:")
print("- Normal distribution has skewness=0 and excess kurtosis=0")
print("- Positive excess kurtosis indicates fat tails (crypto typical: 5-15)")
print("- Negative skewness indicates asymmetry toward negative returns")

======================================================================
RETURN DISTRIBUTION STATISTICS
======================================================================
Asset     Mean  Std Dev  Skewness  Kurtosis  Excess Kurt       Min      Max  VaR (95%)  VaR (99%)
  BTC 0.000118 0.011020 -0.098183  7.288630     7.288630 -0.083609 0.082603  -0.016860  -0.032498
  ETH 0.000003 0.014614 -0.347248  8.743601     8.743601 -0.150616 0.109326  -0.022373  -0.045585
  SOL 0.000003 0.021843 -0.213500 12.971876    12.971876 -0.305443 0.217012  -0.032765  -0.060800

📊 Interpretation:
- Normal distribution has skewness=0 and excess kurtosis=0
- Positive excess kurtosis indicates fat tails (crypto typical: 5-15)
- Negative skewness indicates asymmetry toward negative returns

# Return histograms with fitted Normal and Student-t densities, one panel per asset
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, asset, color in zip(axes, assets, colors):
    returns = data[asset]['returns']

    # Empirical density
    ax.hist(
        returns,
        bins=100,
        density=True,
        alpha=0.6,
        color=color,
        edgecolor='black',
        linewidth=0.5,
    )

    # Evaluation grid spanning the observed return range
    grid = np.linspace(returns.min(), returns.max(), 100)

    # Normal density using the sample mean and standard deviation
    normal_pdf = stats.norm.pdf(grid, returns.mean(), returns.std())
    ax.plot(grid, normal_pdf, 'r--', linewidth=2, label='Normal', alpha=0.8)

    # Student-t density with parameters fitted by scipy
    t_dof, t_loc, t_scale = stats.t.fit(returns)
    t_pdf = stats.t.pdf(grid, t_dof, t_loc, t_scale)
    ax.plot(grid, t_pdf, 'b-', linewidth=2, label=f't (df={t_dof:.1f})', alpha=0.8)

    ax.set_title(f'{asset} Return Distribution')
    ax.set_xlabel('Log Return')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

for line in (
    "\n📊 Key Observation:",
    "Student-t distribution (blue) fits crypto returns better than Normal (red dashed)",
    "This justifies using Student-t or Skewed-t in MS-GARCH specification",
):
    print(line)

📊 Key Observation:
Student-t distribution (blue) fits crypto returns better than Normal (red dashed)
This justifies using Student-t or Skewed-t in MS-GARCH specification

# Q-Q plots against normal distribution, one panel per asset
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, asset, color in zip(axes, assets, colors):
    returns = data[asset]['returns'].dropna()

    # Compute Q-Q statistics (theoretical quantiles, sample quantiles, correlation)
    qq_stats = qq_plot_statistics(returns.values, distribution='norm')

    # Plot
    ax.scatter(
        qq_stats['theoretical'],
        qq_stats['sample'],
        alpha=0.5,
        s=10,
        color=color
    )

    # Add 45-degree line spanning the combined x/y range left by the scatter.
    # NOTE(review): y = x is only the right reference if the helper returns both
    # quantile sets on the same scale (e.g. standardized sample) - confirm.
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),
        np.max([ax.get_xlim(), ax.get_ylim()]),
    ]
    ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0)

    ax.set_title(f'{asset} Q-Q Plot (Normal)\nCorrelation: {qq_stats["correlation"]:.4f}')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Sample Quantiles')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n📊 Q-Q Plot Interpretation:")
print("- Points on red line → data matches normal distribution")
print("- Deviations in tails → fat tails (typical for crypto)")
# An S-shape signals tail weight, not skewness; skewed data bends to one side
print("- Bowed (arc-shaped) pattern → skewness present")

📊 Q-Q Plot Interpretation:
- Points on red line → data matches normal distribution
- Deviations in tails → fat tails (typical for crypto)
- S-shaped pattern → skewness present

# Comprehensive statistical testing: run the four diagnostic helpers per asset
# and print a short summary of their boolean verdicts
print("="*70)
print("STATISTICAL TESTS FOR GARCH MODELING ASSUMPTIONS")
print("="*70)

for asset in assets:
    returns = data[asset]['returns']

    print(f"\n{'='*70}")
    print(f"{asset} - Statistical Validation")
    print(f"{'='*70}")

    # 1. Stationarity (ADF test)
    print("\n1. STATIONARITY TEST (Augmented Dickey-Fuller)")
    print("-" * 70)
    is_stationary, adf_results = validate_stationarity(returns, verbose=True)

    # 2. Heteroskedasticity (ARCH-LM test)
    print("\n2. HETEROSKEDASTICITY TEST (ARCH-LM)")
    print("-" * 70)
    has_arch, arch_results = test_heteroskedasticity(returns, nlags=10, verbose=True)

    # 3. Autocorrelation (Ljung-Box test)
    print("\n3. AUTOCORRELATION TEST (Ljung-Box)")
    print("-" * 70)
    has_autocorr, lb_results = test_autocorrelation(returns, lags=20, verbose=True)

    # 4. Normality (Jarque-Bera test)
    print("\n4. NORMALITY TEST (Jarque-Bera)")
    print("-" * 70)
    is_normal, jb_results = test_normality(returns, verbose=True)

    # The two prerequisites get a mark that reflects the actual outcome;
    # a hard-coded check mark would mislead whenever a test fails
    stationary_mark = "✓" if is_stationary else "✗"
    arch_mark = "✓" if has_arch else "✗"

    print("\n" + "="*70)
    print(f"SUMMARY FOR {asset}:")
    print("="*70)
    print(f"{stationary_mark} Stationary: {is_stationary} (Required for GARCH)")
    print(f"{arch_mark} ARCH Effects: {has_arch} (Justifies GARCH modeling)")
    print(f"  Autocorrelation: {has_autocorr}")
    print(f"  Normal: {is_normal} (False expected for crypto → use Student-t)")

======================================================================
STATISTICAL TESTS FOR GARCH MODELING ASSUMPTIONS
======================================================================

======================================================================
BTC - Statistical Validation
======================================================================

1. STATIONARITY TEST (Augmented Dickey-Fuller)
----------------------------------------------------------------------
ADF Statistic: -19.9030
p-value: 0.0000
Critical Values:
  1%: -3.4312
  5%: -2.8619
  10%: -2.5670
Result: STATIONARY ✓

2. HETEROSKEDASTICITY TEST (ARCH-LM)
----------------------------------------------------------------------
LM Statistic: 367.2011
LM p-value: 0.0000
F Statistic: 38.4725
F p-value: 0.0000
Result: ARCH EFFECTS PRESENT ✓

3. AUTOCORRELATION TEST (Ljung-Box)
----------------------------------------------------------------------
Significant autocorrelation at lags: [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Result: AUTOCORRELATION PRESENT ✓

4. NORMALITY TEST (Jarque-Bera)
----------------------------------------------------------------------
Jarque-Bera Statistic: 17342.9232
p-value: 0.0000
Skewness: -0.0982
Excess Kurtosis: 7.2886
Result: NON-NORMAL ✗ (expected for crypto)

======================================================================
SUMMARY FOR BTC:
======================================================================
✓ Stationary: True (Required for GARCH)
✓ ARCH Effects: True (Justifies GARCH modeling)
  Autocorrelation: True
  Normal: False (False expected for crypto → use Student-t)

======================================================================
ETH - Statistical Validation
======================================================================

1. STATIONARITY TEST (Augmented Dickey-Fuller)
----------------------------------------------------------------------
ADF Statistic: -18.0997
p-value: 0.0000
Critical Values:
  1%: -3.4312
  5%: -2.8619
  10%: -2.5670
Result: STATIONARY ✓

2. HETEROSKEDASTICITY TEST (ARCH-LM)
----------------------------------------------------------------------
LM Statistic: 454.5257
LM p-value: 0.0000
F Statistic: 48.1855
F p-value: 0.0000
Result: ARCH EFFECTS PRESENT ✓

3. AUTOCORRELATION TEST (Ljung-Box)
----------------------------------------------------------------------
Significant autocorrelation at lags: [2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Result: AUTOCORRELATION PRESENT ✓

4. NORMALITY TEST (Jarque-Bera)
----------------------------------------------------------------------
Jarque-Bera Statistic: 25098.3486
p-value: 0.0000
Skewness: -0.3472
Excess Kurtosis: 8.7436
Result: NON-NORMAL ✗ (expected for crypto)

======================================================================
SUMMARY FOR ETH:
======================================================================
✓ Stationary: True (Required for GARCH)
✓ ARCH Effects: True (Justifies GARCH modeling)
  Autocorrelation: True
  Normal: False (False expected for crypto → use Student-t)

======================================================================
SOL - Statistical Validation
======================================================================

1. STATIONARITY TEST (Augmented Dickey-Fuller)
----------------------------------------------------------------------
ADF Statistic: -37.0029
p-value: 0.0000
Critical Values:
  1%: -3.4312
  5%: -2.8619
  10%: -2.5670
Result: STATIONARY ✓

2. HETEROSKEDASTICITY TEST (ARCH-LM)
----------------------------------------------------------------------
LM Statistic: 1802.3912
LM p-value: 0.0000
F Statistic: 233.7969
F p-value: 0.0000
Result: ARCH EFFECTS PRESENT ✓

3. AUTOCORRELATION TEST (Ljung-Box)
----------------------------------------------------------------------
Significant autocorrelation at lags: [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
Result: AUTOCORRELATION PRESENT ✓

4. NORMALITY TEST (Jarque-Bera)
----------------------------------------------------------------------
Jarque-Bera Statistic: 54958.0595
p-value: 0.0000
Skewness: -0.2135
Excess Kurtosis: 12.9719
Result: NON-NORMAL ✗ (expected for crypto)

======================================================================
SUMMARY FOR SOL:
======================================================================
✓ Stationary: True (Required for GARCH)
✓ ARCH Effects: True (Justifies GARCH modeling)
  Autocorrelation: True
  Normal: False (False expected for crypto → use Student-t)

# Rolling realized volatility overlaid on absolute returns, one panel per asset
window = 20  # 20 periods ≈ 3.3 days for 4H data
periods_per_year = 365.25*6  # six 4H bars per day

fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

for ax, asset, color in zip(axes, assets, colors):
    returns = data[asset]['returns']

    # Rolling volatility from the project helper, given an annualization factor
    realized_vol = compute_realized_volatility(
        returns,
        window=window,
        annualization_factor=periods_per_year,
    )

    # Faint background series: absolute returns as a raw volatility proxy.
    # NOTE(review): these are plotted unscaled; if the helper annualizes its
    # output, the two lines sit on very different scales - confirm.
    ax.plot(
        returns.index,
        returns.abs().values,
        label='Absolute Returns',
        color=color,
        linewidth=0.5,
        alpha=0.3,
    )

    # Foreground series: the rolling volatility itself
    ax.plot(
        realized_vol.index,
        realized_vol.values,
        label=f'{window}-period Rolling Volatility',
        color=color,
        linewidth=2,
    )

    ax.set_title(f'{asset} Volatility Clustering')
    ax.set_ylabel(f'{asset} Volatility')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

for line in (
    "\n📊 Volatility Clustering:",
    "- Observe periods of sustained high volatility (e.g., March 2023, August 2024)",
    "- Followed by periods of low volatility (e.g., Q2 2023, early 2024)",
    "- This pattern justifies time-varying volatility models like GARCH",
):
    print(line)

📊 Volatility Clustering:
- Observe periods of sustained high volatility (e.g., March 2023, August 2024)
- Followed by periods of low volatility (e.g., Q2 2023, early 2024)
- This pattern justifies time-varying volatility models like GARCH

# 3x4 grid: per asset, ACF and PACF of returns followed by ACF and PACF of squared returns
fig, axes = plt.subplots(3, 4, figsize=(16, 10))

for row_axes, asset, color in zip(axes, assets, colors):
    returns = data[asset]['returns'].dropna()
    squared_returns = returns ** 2

    # (plot function, series, title suffix) in left-to-right column order
    panels = (
        (plot_acf, returns, 'ACF of Returns'),
        (plot_pacf, returns, 'PACF of Returns'),
        (plot_acf, squared_returns, 'ACF of Squared Returns'),
        (plot_pacf, squared_returns, 'PACF of Squared Returns'),
    )

    for ax, (plot_fn, series, label) in zip(row_axes, panels):
        plot_fn(series, lags=40, ax=ax, color=color, alpha=0.05)
        ax.set_title(f'{asset} - {label}')

plt.tight_layout()
plt.show()

for line in (
    "\n📊 ACF/PACF Interpretation:",
    "- Returns: Little/no autocorrelation (weak market efficiency)",
    "- Squared Returns: Strong autocorrelation (volatility clustering)",
    "- Persistent ACF in squared returns → GARCH effects present",
):
    print(line)

📊 ACF/PACF Interpretation:
- Returns: Little/no autocorrelation (weak market efficiency)
- Squared Returns: Strong autocorrelation (volatility clustering)
- Persistent ACF in squared returns → GARCH effects present

# Static correlation heatmap; runs only when the loader attached cross-asset statistics
if 'cross_asset_stats' in data:
    cross_stats = data['cross_asset_stats']
    corr_matrix = cross_stats['correlation_matrix']

    # Diverging palette centred on zero, with the coefficient written in each cell
    heatmap_options = {
        'annot': True,
        'fmt': '.3f',
        'cmap': 'coolwarm',
        'center': 0,
        'square': True,
        'linewidths': 1,
        'cbar_kws': {'label': 'Correlation'},
    }

    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Return Correlation Matrix (BTC, ETH, SOL)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\n📊 Cross-Asset Correlations:")
    print(corr_matrix)
    print(f"\nAverage correlation: {cross_stats['average_correlation']:.3f}")
    for line in (
        "\nImplications for MS-GARCH:",
        "- High correlation suggests potential for joint regime modeling",
        "- DCC-GARCH extension could capture time-varying correlations",
        "- Regime synchronization analysis needed",
    ):
        print(line)

📊 Cross-Asset Correlations:
          BTC       ETH       SOL
BTC  1.000000  0.841402  0.727352
ETH  0.841402  1.000000  0.733183
SOL  0.727352  0.733183  1.000000

Average correlation: 0.767

Implications for MS-GARCH:
- High correlation suggests potential for joint regime modeling
- DCC-GARCH extension could capture time-varying correlations
- Regime synchronization analysis needed

# Pairwise rolling correlations between the three return series
window = 60  # 60 periods ≈ 10 days for 4H data

# One column per asset, in BTC / ETH / SOL order
returns_df = pd.DataFrame(
    {symbol: data[symbol]['returns'] for symbol in ('BTC', 'ETH', 'SOL')}
)

# Rolling windows are built once per left-hand series and reused
btc_rolling = returns_df['BTC'].rolling(window=window)
eth_rolling = returns_df['ETH'].rolling(window=window)

rolling_corr_btc_eth = btc_rolling.corr(returns_df['ETH'])
rolling_corr_btc_sol = btc_rolling.corr(returns_df['SOL'])
rolling_corr_eth_sol = eth_rolling.corr(returns_df['SOL'])

# Draw the three pairs in a fixed order, taking colours from the shared palette
plt.figure(figsize=(14, 6))
pair_series = (
    ('BTC-ETH', rolling_corr_btc_eth),
    ('BTC-SOL', rolling_corr_btc_sol),
    ('ETH-SOL', rolling_corr_eth_sol),
)
for (pair_label, pair_corr), pair_color in zip(pair_series, colors):
    plt.plot(pair_corr.index, pair_corr.values, label=pair_label, linewidth=2, color=pair_color)

plt.axhline(y=0, color='black', linestyle='--', linewidth=0.8, alpha=0.5)
plt.title('Time-Varying Cross-Asset Correlations', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel(f'{window}-Period Rolling Correlation')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

for line in (
    "\n📊 Rolling Correlation Analysis:",
    "- Correlations vary over time (time-varying nature)",
    "- Typically increase during crisis periods (contagion effect)",
    "- Decrease during calm markets (diversification benefits)",
):
    print(line)

📊 Rolling Correlation Analysis:
- Correlations vary over time (time-varying nature)
- Typically increase during crisis periods (contagion effect)
- Decrease during calm markets (diversification benefits)

# Data quality assessment: missing values, extreme returns, duplicate
# timestamps and time gaps per asset, followed by a verdict derived from them
print("="*70)
print("DATA QUALITY ASSESSMENT")
print("="*70)

threshold = 0.20  # 20% return in 4H
expected_freq = pd.Timedelta(hours=4)
flagged_assets = []  # assets with missing values, duplicate timestamps or gaps

for asset in assets:
    returns = data[asset]['returns']
    n_obs = len(returns)

    print(f"\n{asset}:")
    print("-" * 70)

    # Check for missing values
    missing = returns.isna().sum()
    print(f"Missing values: {missing} ({missing/n_obs*100:.2f}%)")

    # Check for extreme values (reported for information only; they do not
    # affect the verdict below)
    extreme_mask = returns.abs() > threshold
    extreme = extreme_mask.sum()
    print(f"Extreme returns (|r| > {threshold:.0%}): {extreme} ({extreme/n_obs*100:.2f}%)")

    if extreme > 0:
        extreme_dates = returns[extreme_mask]
        print(f"  Extreme return dates (first 5): {extreme_dates.head().index.tolist()}")

    # Check for duplicates
    duplicates = returns.index.duplicated().sum()
    print(f"Duplicate timestamps: {duplicates}")

    # Check for gaps.
    # NOTE(review): the strict "> 8 hours" test does not count a single missing
    # 4H bar (spacing of exactly 8 hours) - confirm this tolerance is intended.
    time_diffs = returns.index.to_series().diff()
    gaps = (time_diffs > expected_freq * 2).sum()  # Gaps > 8 hours
    print(f"Large time gaps (> 8 hours): {gaps}")

    if missing > 0 or duplicates > 0 or gaps > 0:
        flagged_assets.append(asset)

# The verdict now follows from the checks instead of being printed unconditionally
print("\n" + "="*70)
if not flagged_assets:
    print("CONCLUSION: Data quality is suitable for MS-GARCH modeling ✓")
else:
    print(f"CONCLUSION: Data quality issues found for {', '.join(flagged_assets)} - resolve before MS-GARCH modeling ✗")
print("="*70)

======================================================================
DATA QUALITY ASSESSMENT
======================================================================

BTC:
----------------------------------------------------------------------
Missing values: 0 (0.00%)
Extreme returns (|r| > 20%): 0 (0.00%)
Duplicate timestamps: 0
Large time gaps (> 8 hours): 0

ETH:
----------------------------------------------------------------------
Missing values: 0 (0.00%)
Extreme returns (|r| > 20%): 0 (0.00%)
Duplicate timestamps: 0
Large time gaps (> 8 hours): 0

SOL:
----------------------------------------------------------------------
Missing values: 0 (0.00%)
Extreme returns (|r| > 20%): 3 (0.04%)
  Extreme return dates (first 5): [Timestamp('2022-11-09 12:00:00'), Timestamp('2022-11-10 00:00:00'), Timestamp('2022-11-10 12:00:00')]
Duplicate timestamps: 0
Large time gaps (> 8 hours): 0

======================================================================
CONCLUSION: Data quality is suitable for MS-GARCH modeling ✓
======================================================================

| # | Notebook | Article | Focus |
|---|----------|---------|-------|
| 1 | 01_data_exploration (this notebook) | Data Exploration | CRISP-DM methodology |
| 2 | 02_model_development | Model Development | 2-regime GJR-GARCH |
| 3 | 03_backtesting | Backtesting | Walk-forward validation |
| 4 | 04_weekly_data_research | Weekly Optimization | Frequency analysis |

📚 Appendix: Portfolio Article¶

Published Article¶

Main Reference¶

Phase 1: Data Understanding and Exploration¶

MS-GARCH Regime Detection for Cryptocurrency Markets¶

Executive Summary¶

Objectives¶

Key Findings (to be populated)¶

1. Setup and Configuration¶

2. Data Loading¶

3. Price and Return Visualization¶

4. Distribution Analysis¶

5. Statistical Tests¶

6. Volatility Clustering Analysis¶

7. Cross-Asset Analysis¶

8. Data Quality Assessment¶

9. Key Findings and Next Steps¶

Summary of Findings¶

Model Specification Recommendations¶

Next Steps¶

📚 Appendix: Portfolio Article¶

Published Article¶

Related Research in This Series¶

Main Reference¶

Phase 1: Data Understanding and Exploration¶

MS-GARCH Regime Detection for Cryptocurrency Markets¶

Executive Summary¶

Objectives¶

Key Findings (to be populated)¶

1. Setup and Configuration¶

2. Data Loading¶

3. Price and Return Visualization¶

4. Distribution Analysis¶

5. Statistical Tests¶

6. Volatility Clustering Analysis¶

7. Cross-Asset Analysis¶

8. Data Quality Assessment¶

9. Key Findings and Next Steps¶

Summary of Findings¶

Model Specification Recommendations¶

Next Steps¶