Loading all datasets from different sources

Code
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np

from loguru import logger

Code
%load_ext autoreload
%autoreload 2

Code
from discontinuitypy.utils.basic import load_catalog

catalog = load_catalog()
Code
from discontinuitypy.datasets import cIDsDataset

# tau = 60 s detection window, ts = 1 s sampling cadence (matches the 'ts_1s_tau_60s' catalog entries)
sta_dataset = cIDsDataset(sat_id="STA", tau=60, ts=1, catalog=catalog)
jno_dataset = cIDsDataset(sat_id="JNO", tau=60, ts=1, catalog=catalog)
thb_dataset = cIDsDataset(sat_id="THB", tau=60, ts=1, catalog=catalog)
16-Nov-23 23:42:22 INFO  Loading data from 'events.STA_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'STA.MAG.primary_data_ts_1s' (PartitionedDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'events.JNO_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'JNO.MAG.primary_data_ts_1s' (PartitionedDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'events.THB_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'THB.MAG.primary_data_ts_1s' (PartitionedDataset)...
Code
from beforerr.basics import pmap
from discontinuitypy.utils.analysis import filter_tranges_ds
Code
thb_inter_state_sw: pl.LazyFrame = catalog.load('thb.inter_state_sw')
# collecting yields a two-column DataFrame; iterating over it unpacks the columns as Series
start, end = thb_inter_state_sw.select(['start', 'end']).collect()

thb_sw_dataset = filter_tranges_ds(thb_dataset, (start, end))
[11/13/23 20:28:03] INFO  Loading data from 'thb.inter_state_sw' (LazyPolarsDataset)...
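For intuition, restricting events to a set of time ranges can also be written directly in polars. Below is a minimal sketch, assuming a plain DataFrame with a `time` column; the actual `filter_tranges_ds` helper operates on the dataset object and its implementation may differ, and the name `filter_tranges_df` here is hypothetical.

Code
import polars as pl

def filter_tranges_df(df: pl.DataFrame, starts, ends, time_col: str = "time") -> pl.DataFrame:
    # keep rows whose time falls inside any [start, end] interval
    intervals = [pl.col(time_col).is_between(s, e) for s, e in zip(starts, ends)]
    return df.filter(pl.any_horizontal(intervals))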
Code
all_datasets = [sta_dataset, jno_dataset, thb_sw_dataset]
Code
all_candidates_l0: pl.DataFrame = pl.concat(
    all_datasets | pmap(lambda x: x.candidates),
    how="diagonal",  # align columns that differ across datasets, filling missing ones with nulls
)
Code
def combine_candidates(datasets):
    "Concatenate the candidate events of several datasets into one frame."
    return pl.concat(
        datasets | pmap(lambda x: x.candidates),
        how="diagonal",
    )
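The `datasets | pmap(...)` construction pipes the iterable through the mapping supplied by `beforerr`. For readers unfamiliar with that idiom, an equivalent version in plain Python would be:

Code
def combine_candidates_plain(datasets):
    # same result as combine_candidates, using a list comprehension instead of the pmap pipe
    return pl.concat([ds.candidates for ds in datasets], how="diagonal")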

Processing datasets

The combined data contain some extreme values; the next steps inspect and remove them.

Code

NVARS = ['d_star', 'L_mn', 'L_mn_norm', 'j0', 'j0_norm', 'duration', 'v_mn']
DISPLAY_VARS = ['time', 'sat'] + NVARS


def check_candidates(df):
    "Summarize the numeric variables of the candidate events."
    return df[NVARS].describe()

check_candidates(all_candidates_l0)

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 185066.0 | 185066.0 | 185066.0 | 185066.0 | 185066.0 | 185066 | 185066.0 |
| null_count | 0.0 | 4120.0 | 4389.0 | 4120.0 | 4389.0 | 0 | 4120.0 |
| mean | 2.611712 | 2798.843381 | 22.307474 | 11.654787 | 4.713652 | 0:00:08.198437… | 343.811034 |
| std | 491.756741 | 2179.474212 | 20.649185 | 2894.040891 | 1473.838227 | null | 99.930132 |
| min | 0.019601 | 3.381065 | 0.014144 | 0.0561 | 0.00082 | 0:00:01.999999… | 0.41411 |
| 25% | 0.247087 | 1582.102536 | 11.284664 | 0.601477 | 0.028203 | 0:00:05 | 286.126017 |
| 50% | 0.510951 | 2240.279834 | 17.513617 | 1.239019 | 0.051221 | 0:00:07 | 343.325961 |
| 75% | 0.983944 | 3346.020528 | 27.236719 | 2.34897 | 0.091488 | 0:00:10 | 402.282733 |
| max | 152023.367594 | 103745.212024 | 1614.132093 | 1.1500e6 | 583059.205803 | 0:03:16 | 864.604665 |
Code
from datetime import timedelta

def process_candidates_l1(raw_df: pl.DataFrame):
    "Clean the data by removing extreme values."

    df = raw_df.filter(
        pl.col("d_star") < 100,  # exclude JUNO extreme values
        pl.col("v_mn") > 10,
        pl.col("duration") < timedelta(seconds=60),
        # pl.col("j0") < 100
    ).with_columns(
        pl.col("radial_distance").fill_null(1)  # by default, fill with 1 AU
    ).with_columns(
        r_bin=pl.col("radial_distance").round(),
        j0_norm_log=pl.col("j0_norm").log10(),
        L_mn_norm_log=pl.col("L_mn_norm").log10(),
    )

    logger.info(
        f"candidates_l1: {len(df)}, with effective ratio: {len(df) / len(raw_df):.2%}"
    )

    return df

all_candidates_l1 = process_candidates_l1(all_candidates_l0)
%R -i all_candidates_l1 -c conv_pl
check_candidates(all_candidates_l1)
2023-11-08 14:11:23.225 | INFO     | __main__:process_candidates_l1:18 - candidates_l1: 180718, with effective ratio: 97.65%

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 180718.0 | 180718.0 | 180718.0 | 180718.0 | 180718.0 | 180718 | 180718.0 |
| null_count | 0.0 | 0.0 | 264.0 | 0.0 | 264.0 | 0 | 0.0 |
| mean | 0.745751 | 2768.506268 | 22.033678 | 1.865352 | 0.075518 | 0:00:08.118150… | 343.880697 |
| std | 0.771981 | 1909.065522 | 17.629565 | 2.599027 | 0.097857 | null | 99.846681 |
| min | 0.019601 | 48.94197 | 0.124168 | 0.0561 | 0.00082 | 0:00:01.999999… | 10.240242 |
| 25% | 0.243875 | 1581.58393 | 11.279769 | 0.60174 | 0.028229 | 0:00:05 | 286.190021 |
| 50% | 0.50421 | 2238.553736 | 17.499906 | 1.239576 | 0.051251 | 0:00:07 | 343.360031 |
| 75% | 0.97075 | 3340.552536 | 27.186555 | 2.348456 | 0.091501 | 0:00:10 | 402.301723 |
| max | 13.805873 | 35975.767016 | 439.323024 | 393.479096 | 9.634978 | 0:00:59 | 864.604665 |
Code
jno_candidates_l1 = all_candidates_l1.filter(pl.col('sat') == 'JNO')
%R -i jno_candidates_l1 -c conv_pl
Code
from discontinuitypy.utils.analysis import filter_before_jupiter
from discontinuitypy.utils.analysis import link_coord2dim
Code
def process_candidates_l2(raw_df: pl.DataFrame, avg_window="30d"):
    "Average candidate properties per satellite over fixed time windows."
    time_col = "time"

    candidate = (
        raw_df.sort(time_col)
        .group_by_dynamic(time_col, every=avg_window, by="sat")
        .agg(cs.numeric().mean(), cs.duration().mean(), id_count=pl.count())
        .filter(pl.col("id_count") > 50)  # filter out sparse windows (JUNO extreme large thickness)
        .sort(time_col)
        .upsample(time_col, every=avg_window, by="sat", maintain_order=True)  # regularize the time axis
        .with_columns(pl.col("sat").forward_fill())
    )
    return candidate
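The `group_by_dynamic`/`upsample` pair first averages events into fixed windows per satellite, then re-inserts windows that are empty (or were filtered out) as null rows, so the time axis stays regular. A toy illustration with hypothetical data:

Code
from datetime import datetime

toy = pl.DataFrame({
    "time": [datetime(2020, 1, d) for d in (1, 2, 20)],
    "sat": ["JNO", "JNO", "JNO"],
    "j0": [1.0, 3.0, 5.0],
})

out = (
    toy.sort("time")
    .group_by_dynamic("time", every="7d", by="sat")  # one row per 7-day window per satellite
    .agg(pl.col("j0").mean(), id_count=pl.count())
    .sort("time")
    .upsample("time", every="7d", by="sat", maintain_order=True)  # null rows for gap windows
    .with_columns(pl.col("sat").forward_fill())
)
print(out)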
Code
all_candidates_l2: pl.DataFrame = (
    all_candidates_l1.pipe(filter_before_jupiter)
    .pipe(process_candidates_l2)
    .pipe(link_coord2dim)
)
Code
inspect_df = all_candidates_l2[NVARS]
inspect_df.describe()

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 172.0 | 172.0 | 172.0 | 172.0 | 172.0 | 172 | 172.0 |
| null_count | 19.0 | 19.0 | 19.0 | 19.0 | 19.0 | 19 | 19.0 |
| mean | 0.706261 | 2922.959632 | 22.028999 | 1.937378 | 0.090728 | 0:00:08.719631… | 337.428018 |
| std | 0.358616 | 512.032439 | 8.140589 | 1.077249 | 0.051647 | null | 37.917741 |
| min | 0.108318 | 1877.983131 | 7.074407 | 0.229362 | 0.042024 | 0:00:06.751012… | 256.771354 |
| 25% | 0.331532 | 2590.280777 | 14.498058 | 0.795284 | 0.060267 | 0:00:07.707419… | 315.324913 |
| 50% | 0.794667 | 2786.745403 | 22.804505 | 2.087583 | 0.069789 | 0:00:08.730158… | 335.332916 |
| 75% | 0.931735 | 3182.843841 | 27.726721 | 2.633037 | 0.094061 | 0:00:09.315238… | 359.837854 |
| max | 1.539393 | 4458.507484 | 41.436617 | 4.784021 | 0.306938 | 0:00:12.305699… | 445.849288 |
Code
from discontinuitypy.utils.analysis import n2_normalize

all_candidates_l2_n2 = n2_normalize(all_candidates_l2, NVARS)
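The exact definition of `n2_normalize` lives in `discontinuitypy.utils.analysis`. Purely as an assumption for illustration, a typical per-satellite rescaling of the listed variables might look like the following sketch; the real helper may normalize quite differently.

Code
def normalize_columns(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    # hypothetical sketch: min-max scale each column within each satellite;
    # not necessarily what n2_normalize actually does
    return df.with_columns(
        [
            ((pl.col(c) - pl.col(c).min()) / (pl.col(c).max() - pl.col(c).min()))
            .over("sat")
            .alias(f"{c}_norm")
            for c in cols
        ]
    )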