Loading all datasets from different sources

Code
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np

from loguru import logger

Code
%load_ext autoreload
%autoreload 2

Code
from discontinuitypy.utils.basic import load_catalog

catalog = load_catalog()
Code
from discontinuitypy.datasets import cIDsDataset

# tau = 60 s detection window, ts = 1 s sampling cadence (matches the 'ts_1s_tau_60s' catalog entries)
sta_dataset = cIDsDataset(sat_id="STA", tau=60, ts=1, catalog=catalog)
jno_dataset = cIDsDataset(sat_id="JNO", tau=60, ts=1, catalog=catalog)
thb_dataset = cIDsDataset(sat_id="THB", tau=60, ts=1, catalog=catalog)
16-Nov-23 23:42:22 INFO  Loading data from 'events.STA_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'STA.MAG.primary_data_ts_1s' (PartitionedDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'events.JNO_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'JNO.MAG.primary_data_ts_1s' (PartitionedDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'events.THB_ts_1s_tau_60s' (LazyPolarsDataset)...
16-Nov-23 23:42:22 INFO  Loading data from 'THB.MAG.primary_data_ts_1s' (PartitionedDataset)...
Code
from beforerr.basics import pmap
from discontinuitypy.utils.analysis import filter_tranges_ds
Code
thb_inter_state_sw: pl.LazyFrame = catalog.load('thb.inter_state_sw')
# collecting yields a two-column DataFrame; iterating over it unpacks the columns as Series
start, end = thb_inter_state_sw.select(['start', 'end']).collect()

thb_sw_dataset = filter_tranges_ds(thb_dataset, (start, end))
[11/13/23 20:28:03] INFO  Loading data from 'thb.inter_state_sw' (LazyPolarsDataset)...
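For intuition, restricting events to a set of time ranges can also be written directly in polars. Below is a minimal sketch, assuming a plain DataFrame with a `time` column; the actual `filter_tranges_ds` helper operates on the dataset object and its implementation may differ, and the name `filter_tranges_df` here is hypothetical.

Code
import polars as pl

def filter_tranges_df(df: pl.DataFrame, starts, ends, time_col: str = "time") -> pl.DataFrame:
    # keep rows whose time falls inside any [start, end] interval
    intervals = [pl.col(time_col).is_between(s, e) for s, e in zip(starts, ends)]
    return df.filter(pl.any_horizontal(intervals))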
Code
all_datasets = [sta_dataset, jno_dataset, thb_sw_dataset]
Code
all_candidates_l0: pl.DataFrame = pl.concat(
    all_datasets | pmap(lambda x: x.candidates),
    how="diagonal",  # align columns that differ across datasets, filling missing ones with nulls
)
Code
def combine_candidates(datasets):
    "Concatenate the candidate events of several datasets into one frame."
    return pl.concat(
        datasets | pmap(lambda x: x.candidates),
        how="diagonal",
    )
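The `datasets | pmap(...)` construction pipes the iterable through the mapping supplied by `beforerr`. For readers unfamiliar with that idiom, an equivalent version in plain Python would be:

Code
def combine_candidates_plain(datasets):
    # same result as combine_candidates, using a list comprehension instead of the pmap pipe
    return pl.concat([ds.candidates for ds in datasets], how="diagonal")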

Processing datasets

The combined data contain some extreme values; the next steps inspect and remove them.

Code

NVARS = ['d_star', 'L_mn', 'L_mn_norm', 'j0', 'j0_norm', 'duration', 'v_mn']
DISPLAY_VARS = ['time', 'sat'] + NVARS


def check_candidates(df):
    "Summarize the numeric variables of the candidate events."
    return df[NVARS].describe()

check_candidates(all_candidates_l0)

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 185066.0 | 185066.0 | 185066.0 | 185066.0 | 185066.0 | 185066 | 185066.0 |
| null_count | 0.0 | 4120.0 | 4389.0 | 4120.0 | 4389.0 | 0 | 4120.0 |
| mean | 2.611712 | 2798.843381 | 22.307474 | 11.654787 | 4.713652 | 0:00:08.198437… | 343.811034 |
| std | 491.756741 | 2179.474212 | 20.649185 | 2894.040891 | 1473.838227 | null | 99.930132 |
| min | 0.019601 | 3.381065 | 0.014144 | 0.0561 | 0.00082 | 0:00:01.999999… | 0.41411 |
| 25% | 0.247087 | 1582.102536 | 11.284664 | 0.601477 | 0.028203 | 0:00:05 | 286.126017 |
| 50% | 0.510951 | 2240.279834 | 17.513617 | 1.239019 | 0.051221 | 0:00:07 | 343.325961 |
| 75% | 0.983944 | 3346.020528 | 27.236719 | 2.34897 | 0.091488 | 0:00:10 | 402.282733 |
| max | 152023.367594 | 103745.212024 | 1614.132093 | 1.1500e6 | 583059.205803 | 0:03:16 | 864.604665 |
Code
from datetime import timedelta

def process_candidates_l1(raw_df: pl.DataFrame):
    "Clean the data by removing extreme values."

    df = raw_df.filter(
        pl.col("d_star") < 100,  # exclude JUNO extreme values
        pl.col("v_mn") > 10,
        pl.col("duration") < timedelta(seconds=60),
        # pl.col("j0") < 100
    ).with_columns(
        pl.col("radial_distance").fill_null(1)  # by default, fill with 1 AU
    ).with_columns(
        r_bin=pl.col("radial_distance").round(),
        j0_norm_log=pl.col("j0_norm").log10(),
        L_mn_norm_log=pl.col("L_mn_norm").log10(),
    )

    logger.info(
        f"candidates_l1: {len(df)}, with effective ratio: {len(df) / len(raw_df):.2%}"
    )

    return df

all_candidates_l1 = process_candidates_l1(all_candidates_l0)
%R -i all_candidates_l1 -c conv_pl
check_candidates(all_candidates_l1)
2023-11-08 14:11:23.225 | INFO     | __main__:process_candidates_l1:18 - candidates_l1: 180718, with effective ratio: 97.65%

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 180718.0 | 180718.0 | 180718.0 | 180718.0 | 180718.0 | 180718 | 180718.0 |
| null_count | 0.0 | 0.0 | 264.0 | 0.0 | 264.0 | 0 | 0.0 |
| mean | 0.745751 | 2768.506268 | 22.033678 | 1.865352 | 0.075518 | 0:00:08.118150… | 343.880697 |
| std | 0.771981 | 1909.065522 | 17.629565 | 2.599027 | 0.097857 | null | 99.846681 |
| min | 0.019601 | 48.94197 | 0.124168 | 0.0561 | 0.00082 | 0:00:01.999999… | 10.240242 |
| 25% | 0.243875 | 1581.58393 | 11.279769 | 0.60174 | 0.028229 | 0:00:05 | 286.190021 |
| 50% | 0.50421 | 2238.553736 | 17.499906 | 1.239576 | 0.051251 | 0:00:07 | 343.360031 |
| 75% | 0.97075 | 3340.552536 | 27.186555 | 2.348456 | 0.091501 | 0:00:10 | 402.301723 |
| max | 13.805873 | 35975.767016 | 439.323024 | 393.479096 | 9.634978 | 0:00:59 | 864.604665 |
Code
jno_candidates_l1 = all_candidates_l1.filter(pl.col('sat') == 'JNO')
%R -i jno_candidates_l1 -c conv_pl
Code
from discontinuitypy.utils.analysis import filter_before_jupiter
from discontinuitypy.utils.analysis import link_coord2dim
Code
def process_candidates_l2(raw_df: pl.DataFrame, avg_window="30d"):
    "Average candidate properties per satellite over fixed time windows."
    time_col = "time"

    candidate = (
        raw_df.sort(time_col)
        .group_by_dynamic(time_col, every=avg_window, by="sat")
        .agg(cs.numeric().mean(), cs.duration().mean(), id_count=pl.count())
        .filter(pl.col("id_count") > 50)  # filter out sparse windows (JUNO extreme large thickness)
        .sort(time_col)
        .upsample(time_col, every=avg_window, by="sat", maintain_order=True)  # regularize the time axis
        .with_columns(pl.col("sat").forward_fill())
    )
    return candidate
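The `group_by_dynamic`/`upsample` pair first averages events into fixed windows per satellite, then re-inserts windows that are empty (or were filtered out) as null rows, so the time axis stays regular. A toy illustration with hypothetical data:

Code
from datetime import datetime

toy = pl.DataFrame({
    "time": [datetime(2020, 1, d) for d in (1, 2, 20)],
    "sat": ["JNO", "JNO", "JNO"],
    "j0": [1.0, 3.0, 5.0],
})

out = (
    toy.sort("time")
    .group_by_dynamic("time", every="7d", by="sat")  # one row per 7-day window per satellite
    .agg(pl.col("j0").mean(), id_count=pl.count())
    .sort("time")
    .upsample("time", every="7d", by="sat", maintain_order=True)  # null rows for gap windows
    .with_columns(pl.col("sat").forward_fill())
)
print(out)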
Code
all_candidates_l2: pl.DataFrame = (
    all_candidates_l1.pipe(filter_before_jupiter)
    .pipe(process_candidates_l2)
    .pipe(link_coord2dim)
)
Code
inspect_df = all_candidates_l2[NVARS]
inspect_df.describe()

shape: (9, 8)

| describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | str | f64 |
| count | 172.0 | 172.0 | 172.0 | 172.0 | 172.0 | 172 | 172.0 |
| null_count | 19.0 | 19.0 | 19.0 | 19.0 | 19.0 | 19 | 19.0 |
| mean | 0.706261 | 2922.959632 | 22.028999 | 1.937378 | 0.090728 | 0:00:08.719631… | 337.428018 |
| std | 0.358616 | 512.032439 | 8.140589 | 1.077249 | 0.051647 | null | 37.917741 |
| min | 0.108318 | 1877.983131 | 7.074407 | 0.229362 | 0.042024 | 0:00:06.751012… | 256.771354 |
| 25% | 0.331532 | 2590.280777 | 14.498058 | 0.795284 | 0.060267 | 0:00:07.707419… | 315.324913 |
| 50% | 0.794667 | 2786.745403 | 22.804505 | 2.087583 | 0.069789 | 0:00:08.730158… | 335.332916 |
| 75% | 0.931735 | 3182.843841 | 27.726721 | 2.633037 | 0.094061 | 0:00:09.315238… | 359.837854 |
| max | 1.539393 | 4458.507484 | 41.436617 | 4.784021 | 0.306938 | 0:00:12.305699… | 445.849288 |
Code
from discontinuitypy.utils.analysis import n2_normalize

all_candidates_l2_n2 = n2_normalize(all_candidates_l2, NVARS)
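The exact definition of `n2_normalize` lives in `discontinuitypy.utils.analysis`. Purely as an assumption for illustration, a typical per-satellite rescaling of the listed variables might look like the following sketch; the real helper may normalize quite differently.

Code
def normalize_columns(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    # hypothetical sketch: min-max scale each column within each satellite;
    # not necessarily what n2_normalize actually does
    return df.with_columns(
        [
            ((pl.col(c) - pl.col(c).min()) / (pl.col(c).max() - pl.col(c).min()))
            .over("sat")
            .alias(f"{c}_norm")
            for c in cols
        ]
    )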