Code
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np
from loguru import logger
::: {#cell-2 .cell 0=‘h’ 1=‘i’ 2=‘d’ 3=‘e’ execution_count=2}
:::
16-Nov-23 23:42:22 INFO 16-Nov-23 23:42:22: Loading data from 'events.STA_ts_1s_tau_60s' data_catalog.py:502 (LazyPolarsDataset)...
INFO 16-Nov-23 23:42:22: Loading data from 'STA.MAG.primary_data_ts_1s' data_catalog.py:502 (PartitionedDataset)...
INFO 16-Nov-23 23:42:22: Loading data from 'events.JNO_ts_1s_tau_60s' data_catalog.py:502 (LazyPolarsDataset)...
INFO 16-Nov-23 23:42:22: Loading data from 'JNO.MAG.primary_data_ts_1s' data_catalog.py:502 (PartitionedDataset)...
INFO 16-Nov-23 23:42:22: Loading data from 'events.THB_ts_1s_tau_60s' data_catalog.py:502 (LazyPolarsDataset)...
INFO 16-Nov-23 23:42:22: Loading data from 'THB.MAG.primary_data_ts_1s' data_catalog.py:502 (PartitionedDataset)...
[11/13/23 20:28:03] INFO Loading data from 'thb.inter_state_sw' (LazyPolarsDataset)... data_catalog.py:502
Some extreme values are present in the data (see the "max" row below, e.g. `d_star` ≈ 1.5e5 and `j0` ≈ 1.15e6, far above their 75% quantiles). We will remove them.
describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | str | f64 |
"count" | 185066.0 | 185066.0 | 185066.0 | 185066.0 | 185066.0 | "185066" | 185066.0 |
"null_count" | 0.0 | 4120.0 | 4389.0 | 4120.0 | 4389.0 | "0" | 4120.0 |
"mean" | 2.611712 | 2798.843381 | 22.307474 | 11.654787 | 4.713652 | "0:00:08.198437… | 343.811034 |
"std" | 491.756741 | 2179.474212 | 20.649185 | 2894.040891 | 1473.838227 | null | 99.930132 |
"min" | 0.019601 | 3.381065 | 0.014144 | 0.0561 | 0.00082 | "0:00:01.999999… | 0.41411 |
"25%" | 0.247087 | 1582.102536 | 11.284664 | 0.601477 | 0.028203 | "0:00:05" | 286.126017 |
"50%" | 0.510951 | 2240.279834 | 17.513617 | 1.239019 | 0.051221 | "0:00:07" | 343.325961 |
"75%" | 0.983944 | 3346.020528 | 27.236719 | 2.34897 | 0.091488 | "0:00:10" | 402.282733 |
"max" | 152023.367594 | 103745.212024 | 1614.132093 | 1.1500e6 | 583059.205803 | "0:03:16" | 864.604665 |
from datetime import timedelta
def process_candidates_l1(raw_df: pl.DataFrame) -> pl.DataFrame:
    """Clean the candidate table by removing extreme values.

    Rows are kept only when ``d_star < 100``, ``v_mn > 10`` and ``duration``
    is shorter than 60 seconds. Missing ``radial_distance`` values are then
    filled with 1, and three derived columns are appended:

    * ``r_bin``: ``radial_distance`` rounded to the nearest integer,
    * ``j0_norm_log``: base-10 logarithm of ``j0_norm``,
    * ``L_mn_norm_log``: base-10 logarithm of ``L_mn_norm``.

    The number of surviving rows and their share of the input are logged.

    Args:
        raw_df: Candidate events. Must provide the columns ``d_star``,
            ``v_mn``, ``duration``, ``radial_distance``, ``j0_norm`` and
            ``L_mn_norm``.

    Returns:
        The filtered frame with the derived columns added.
    """
    df = (
        raw_df.filter(
            pl.col("d_star") < 100,  # exclude JUNO extreme values
            pl.col("v_mn") > 10,
            pl.col("duration") < timedelta(seconds=60),
            # (a ``j0 < 100`` cut is currently disabled)
        )
        .with_columns(
            pl.col("radial_distance").fill_null(1)  # by default, fill with 1 AU
        )
        # Kept as a separate step so that ``r_bin`` is computed from the
        # already-filled ``radial_distance`` column.
        .with_columns(
            r_bin=pl.col("radial_distance").round(),
            j0_norm_log=pl.col("j0_norm").log10(),
            L_mn_norm_log=pl.col("L_mn_norm").log10(),
        )
    )

    # An empty input has no meaningful ratio; avoid dividing by zero.
    n_raw = len(raw_df)
    ratio = len(df) / n_raw if n_raw else float("nan")
    logger.info(f"candidates_l1: {len(df)}, with effective ratio: {ratio:.2%}")
    return df
# Apply the level-1 cleaning to the level-0 candidates
# (`all_candidates_l0` is not defined in this section).
all_candidates_l1 = process_candidates_l1(all_candidates_l0)
# rpy2 line magic: pass `all_candidates_l1` into the R session (-i) using the
# `conv_pl` converter (-c). NOTE(review): `conv_pl` is presumably a
# polars-aware rpy2 converter defined elsewhere — confirm.
%R -i all_candidates_l1 -c conv_pl
# Inspect the cleaned candidates (`check_candidates` is defined elsewhere).
check_candidates(all_candidates_l1)
2023-11-08 14:11:23.225 | INFO | __main__:process_candidates_l1:18 - candidates_l1: 180718, with effective ratio: 97.65%
describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | str | f64 |
"count" | 180718.0 | 180718.0 | 180718.0 | 180718.0 | 180718.0 | "180718" | 180718.0 |
"null_count" | 0.0 | 0.0 | 264.0 | 0.0 | 264.0 | "0" | 0.0 |
"mean" | 0.745751 | 2768.506268 | 22.033678 | 1.865352 | 0.075518 | "0:00:08.118150… | 343.880697 |
"std" | 0.771981 | 1909.065522 | 17.629565 | 2.599027 | 0.097857 | null | 99.846681 |
"min" | 0.019601 | 48.94197 | 0.124168 | 0.0561 | 0.00082 | "0:00:01.999999… | 10.240242 |
"25%" | 0.243875 | 1581.58393 | 11.279769 | 0.60174 | 0.028229 | "0:00:05" | 286.190021 |
"50%" | 0.50421 | 2238.553736 | 17.499906 | 1.239576 | 0.051251 | "0:00:07" | 343.360031 |
"75%" | 0.97075 | 3340.552536 | 27.186555 | 2.348456 | 0.091501 | "0:00:10" | 402.301723 |
"max" | 13.805873 | 35975.767016 | 439.323024 | 393.479096 | 9.634978 | "0:00:59" | 864.604665 |
def process_candidates_l2(
    raw_df: pl.DataFrame,
    avg_window="30d",
    *,
    time_col: str = "time",
    min_count: int = 50,
):
    """Average the candidates per satellite over fixed time windows.

    The frame is sorted by ``time_col`` and grouped per ``sat`` into windows
    of length ``avg_window``. Every numeric and duration column is replaced
    by its window mean, and the number of rows per window is stored in
    ``id_count``. Windows with ``id_count <= min_count`` are discarded, after
    which each satellite's series is upsampled back onto the regular
    ``avg_window`` grid, so missing windows show up as rows of nulls with
    ``sat`` forward-filled.

    Args:
        raw_df: Candidate events with a ``sat`` column and a temporal
            ``time_col`` column.
        avg_window: Window length in polars duration-string syntax
            (e.g. ``"30d"``).
        time_col: Name of the time column to window on.
        min_count: A window is kept only if it holds more than this many
            events.

    Returns:
        One row per satellite and window with the averaged columns and
        ``id_count``.
    """
    averaged = (
        raw_df.sort(time_col)
        .group_by_dynamic(time_col, every=avg_window, by="sat")
        .agg(cs.numeric().mean(), cs.duration().mean(), id_count=pl.count())
    )

    candidate = (
        # Drop sparsely populated windows. The original note says this is
        # meant to filter out JUNO's extremely large thickness values.
        averaged.filter(pl.col("id_count") > min_count)
        .sort(time_col)
        .upsample(time_col, every=avg_window, by="sat", maintain_order=True)
        # Rows inserted by the upsampling have no satellite label yet.
        .with_columns(pl.col("sat").forward_fill())
    )
    return candidate
describe | d_star | L_mn | L_mn_norm | j0 | j0_norm | duration | v_mn |
---|---|---|---|---|---|---|---|
str | f64 | f64 | f64 | f64 | f64 | str | f64 |
"count" | 172.0 | 172.0 | 172.0 | 172.0 | 172.0 | "172" | 172.0 |
"null_count" | 19.0 | 19.0 | 19.0 | 19.0 | 19.0 | "19" | 19.0 |
"mean" | 0.706261 | 2922.959632 | 22.028999 | 1.937378 | 0.090728 | "0:00:08.719631… | 337.428018 |
"std" | 0.358616 | 512.032439 | 8.140589 | 1.077249 | 0.051647 | null | 37.917741 |
"min" | 0.108318 | 1877.983131 | 7.074407 | 0.229362 | 0.042024 | "0:00:06.751012… | 256.771354 |
"25%" | 0.331532 | 2590.280777 | 14.498058 | 0.795284 | 0.060267 | "0:00:07.707419… | 315.324913 |
"50%" | 0.794667 | 2786.745403 | 22.804505 | 2.087583 | 0.069789 | "0:00:08.730158… | 335.332916 |
"75%" | 0.931735 | 3182.843841 | 27.726721 | 2.633037 | 0.094061 | "0:00:09.315238… | 359.837854 |
"max" | 1.539393 | 4458.507484 | 41.436617 | 4.784021 | 0.306938 | "0:00:12.305699… | 445.849288 |