Occurence Rate

Setup

Import libraries

Code

from fastcore.utils import patch
from fastcore.test import *

from discontinuitypy.utils.basic import load_catalog
from discontinuitypy.utils.basic import filter_tranges_df
from discontinuitypy.utils.analysis import filter_before_jupiter
from discontinuitypy.utils.analysis import link_coord2dim
from discontinuitypy.datasets import cIDsDataset

from beforerr.basics import pmap
from beforerr.r import py2rpy_polars
import rpy2.robjects as robjects

from datetime import timedelta

import polars as pl
import polars.selectors as cs
import pandas as pd

import warnings

# This will filter out all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%load_ext autoreload
%autoreload 2

04-Dec-23 09:02:03 INFO     04-Dec-23 09:02:03: cffi mode is CFFI_MODE.ANY                          situation.py:41

                   INFO     04-Dec-23 09:02:03: R home found:                                      situation.py:218
                            /Library/Frameworks/R.framework/Resources

                   INFO     04-Dec-23 09:02:03: R library path:                                    situation.py:161

                   INFO     04-Dec-23 09:02:03: LD_LIBRARY_PATH:                                   situation.py:165

                   INFO     04-Dec-23 09:02:03: Default options to initialize R: rpy2, --quiet,      embedded.py:20
                            --no-save

                   INFO     04-Dec-23 09:02:03: R is already initialized. No need to initialize.    embedded.py:269

Code

catalog = load_catalog()

[12/04/23 09:02:04] WARNING  /Users/zijin/miniforge3/envs/cool_planet/lib/python3.10/site-packages/ke logger.py:205
                             dro_datasets/polars/lazy_polars_dataset.py:14: KedroDeprecationWarning:               
                             'AbstractVersionedDataSet' has been renamed to                                        
                             'AbstractVersionedDataset', and the alias will be removed in Kedro                    
                             0.19.0                                                                                
                               from kedro.io.core import (

Connect python with R kernel

Code

%load_ext rpy2.ipython

r = robjects.r
r.source('utils.R')

conv_pl = py2rpy_polars()

[12/04/23 09:02:05] WARNING  R[write to console]:                                                  callbacks.py:124
                             Attaching package: ‘dplyr’

                    WARNING  R[write to console]: The following objects are masked from            callbacks.py:124
                             ‘package:stats’:                                                                      
                                                                                                                   
                                 filter, lag

                    WARNING  R[write to console]: The following objects are masked from            callbacks.py:124
                             ‘package:base’:                                                                       
                                                                                                                   
                                 intersect, setdiff, setequal, union

Loading all datasets from different sources

Code

STA_ds = cIDsDataset(sat_id="STA", tau=60, ts=1, catalog=catalog)
JNO_ds = cIDsDataset(sat_id="JNO", tau=60, ts=1, catalog=catalog)
THB_ds = cIDsDataset(sat_id="THB", tau=60, ts=1, catalog=catalog)
Wind_ds = cIDsDataset(sat_id="Wind", tau=60, ts=1, catalog=catalog)

                    INFO     Loading data from 'events.STA_ts_1s_tau_60s'                       data_catalog.py:502
                             (LazyPolarsDataset)...

                    INFO     Loading data from 'STA.MAG.primary_data_ts_1s'                     data_catalog.py:502
                             (PartitionedDataset)...

                    INFO     Loading data from 'events.JNO_ts_1s_tau_60s'                       data_catalog.py:502
                             (LazyPolarsDataset)...

                    INFO     Loading data from 'JNO.MAG.primary_data_ts_1s'                     data_catalog.py:502
                             (PartitionedDataset)...

                    INFO     Loading data from 'events.THB_ts_1s_tau_60s'                       data_catalog.py:502
                             (LazyPolarsDataset)...

                    INFO     Loading data from 'THB.MAG.primary_data_ts_1s'                     data_catalog.py:502
                             (PartitionedDataset)...

                    INFO     Loading data from 'events.Wind_ts_1s_tau_60s'                      data_catalog.py:502
                             (LazyPolarsDataset)...

                    INFO     Loading data from 'Wind.MAG.primary_data_ts_1s'                    data_catalog.py:502
                             (PartitionedDataset)...

ARTEMIS missions needs additional care as they are not always in the solar wind.

Code

thb_inter_state_sw: pl.LazyFrame = catalog.load('THB.STATE.inter_data_sw')
start, end = thb_inter_state_sw.select(['start', 'end']).collect()

THB_sw_ds = cIDsDataset(
    sat_id="THB", tau=60, ts=1, catalog=catalog,
    candidates = filter_tranges_df(THB_ds.candidates, (start, end)), 
    data = filter_tranges_df(THB_ds.data.collect(), (start, end)).lazy()
)

THB_sw_ds = THB_ds.copy(
    update=dict(
        candidates = THB_ds.candidates.pipe(filter_tranges_df, (start, end)),
        data = THB_ds.data.collect().pipe(filter_tranges_df, (start, end)).lazy()
    )
)

                    INFO     Loading data from 'THB.STATE.inter_data_sw' (LazyPolarsDataset)... data_catalog.py:502

Code

all_ds = [JNO_ds, Wind_ds, STA_ds, THB_sw_ds]

Distance and Occurrence rates versus time for JUNO

Code

def calc_or_df(events: pl.DataFrame, avg_window: timedelta, by=None):
    """Calculate the occurence rate of the candidates with the average window.

    Notes: occurence rate is defined as the number of candidates per day.
    """

    every = avg_window
    or_factor = every / timedelta(days=1)

    return (
        events.sort("time")
        .group_by_dynamic("time", every=every, by=by)
        .agg(
            cs.float().mean(),
            o_rates=pl.count() / or_factor
        )
        .upsample("time", every=every)  # upsample to fill the missing time
    )

Code

df = JNO_ds.candidates.pipe(calc_or_df, timedelta(days=5))
%R -i df -c conv_pl

Code

%%R
p1 <- ggplot(df, aes(x = time, y = radial_distance)) + 
  geom_line() + # Plot distance by date
  labs(x = "Date", y = "Distance (AU)") +
  theme_pubr(base_size = 16) + 
  theme(aspect.ratio=0.25)
  
p2 <- ggplot(df, aes(x = time, y = o_rates)) + 
  geom_line() + # Plot distance by date
  labs(x = "Date", y = "Occurrence Rates (#/day)") +
  theme_pubr(base_size = 16) + 
  theme(aspect.ratio=0.25)

p <- ggarrange(p1, p2, nrow = 2)

# save_plot("distance_and_or")
p

Occurrence rates versus time for all missions

Data normalization

Different levels of normalization are applied to the data. The normalization is done in the following order:

N1: normalize the data by the the effective time of every duration due to the data gap as we may miss some potential IDs. We assume the data gap is independent of the magnetic discontinuities.
N2: normalize the data by the mean value of data near 1 AU. This is to remove the effect of the temporal variation of the solar wind.

Code

def calc_n1_factor(
    data: pl.LazyFrame,
    s_resolution: timedelta,
    l_resolution: timedelta,
):
    return (
        data.sort("time")
        .group_by_dynamic("time", every=s_resolution)
        .agg(pl.lit(1).alias("availablity"))
        .group_by_dynamic("time", every=l_resolution)
        .agg(n1_factor=pl.sum("availablity") * s_resolution / l_resolution)
    )


def n1_normalize(
    df: pl.DataFrame,  # the dataframe with count to be normalized
    data: pl.LazyFrame,  # the data used to calculate the duration ratio
    s_resolution,  # the smallest resolution to check if the data is available
    avg_window,
):
    duration_df = calc_n1_factor(data, s_resolution, avg_window).with_columns(
        cs.datetime().dt.cast_time_unit("ns"),
    )

    return df.lazy().join(duration_df, how="left", on="time").with_columns(
        o_rates_normalized=pl.col("o_rates") / pl.col("n1_factor")
    ).collect()
    
@patch
def calc_or_normalized(
    self: cIDsDataset, s_resolution: timedelta, avg_window: timedelta
):
    count_df = calc_or_df(self.candidates, avg_window)
    return n1_normalize(count_df, self.data, s_resolution, avg_window)

Code

def n2_normalize(df: pl.DataFrame, avg_sats = ["STA", "THB", "Wind"]):
    avg_df = (
        df.filter(pl.col("sat").is_in(avg_sats))
        .group_by("time")
        .agg(n2_factor=pl.mean("o_rates_normalized"))
    )
    return df.join(avg_df, on="time").with_columns(
        o_rates_normalized=pl.col("o_rates_normalized") / pl.col("n2_factor")
    )

Code

s_resolution = timedelta(minutes=1)
avg_window = timedelta(days=30)

Code

all_events_or_N1: pl.DataFrame = pl.concat(
    all_ds
    | pmap(
        lambda x: x.calc_or_normalized(s_resolution, avg_window).with_columns(
            sat=pl.lit(x.sat_id)
        )
    ),
    how="diagonal",
)

Code

all_events_or_N2 = all_events_or_N1.pipe(n2_normalize)

Code

%%R
plot_or_time <- function(df) {
  p <- ggline(
    df, x = "time", y = "o_rates_normalized", 
    color = "sat", linetype = "sat") 
  
  p +   
    labs(x = "Date", y = "Occurrence Rates (#/day)", color="Satellites", linetype="Satellites") + 
    theme_pubr(base_size = 16) + 
    theme(legend.text = element_text(size=16)) +
    scale_color_okabeito(palette = "black_first")
}

Code

%%R -i all_events_or_N1 -c conv_pl
p <- plot_or_time(all_events_or_N1)
p

[12/04/23 09:02:37] WARNING  R[write to console]: In addition:                                     callbacks.py:124

                    WARNING  R[write to console]: Warning message:                                 callbacks.py:124

                    WARNING  R[write to console]: Removed 6 rows containing missing values         callbacks.py:124
                             (`geom_point()`).

We noticed some anomalies in the occurrence rates of the magnetic discontinuities for Stereo-A data. Also for Juno, its occurrence rate is much higher when approaching Jupiter.

Code

all_events_or_N1.filter(
    pl.col('time').is_in(pd.date_range('2014-01-01', '2015-01-01')),
    sat='STA'
)[["time", "o_rates", "o_rates_normalized", 'n1_factor']]

shape: (12, 4)

time	o_rates	o_rates_normalized	n1_factor
datetime[ns]	f64	f64	f64
2014-01-10 00:00:00	64.466667	67.349278	0.957199
2014-02-09 00:00:00	71.2	71.232978	0.999537
2014-03-11 00:00:00	71.666667	74.137931	0.966667
2014-04-10 00:00:00	72.2	72.210029	0.999861
2014-05-10 00:00:00	74.166667	74.182121	0.999792
2014-06-09 00:00:00	65.733333	71.530265	0.918958
2014-07-09 00:00:00	68.233333	76.778496	0.888704
2014-08-08 00:00:00	4.033333	9.013968	0.447454
2014-09-07 00:00:00	11.0	69.53468	0.158194
2014-10-07 00:00:00	6.6	77.163735	0.085532
2014-11-06 00:00:00	9.866667	66.548009	0.148264
2014-12-06 00:00:00	9.766667	63.686038	0.153356

Surprisingly, we found out that the anomaly of STEREO-A data is not mainly due to data gap. We can inspect this data further. See appendix.

We remove the intervals which do not have enough data points to calculate the normalized occurrence rates accurately and restrict the time range to exclude Jupiter’s effect.

Code

all_events_or_N1_cleaned = (
    all_events_or_N1.sort("time")
    .pipe(filter_before_jupiter)
    .filter(pl.col("o_rates") > 5)
    .upsample("time", every=avg_window, by="sat", maintain_order=True)
    .with_columns(pl.col("sat").forward_fill())
)

all_events_or_N2_cleaned = n2_normalize(all_events_or_N1_cleaned)

Code

%%R -i all_events_or_N1_cleaned -c conv_pl
p <- plot_or_time(all_events_or_N1_cleaned)
# save_plot("ocr/ocr_time_cleaned")
p

[12/04/23 09:07:11] WARNING  R[write to console]: In addition:                                     callbacks.py:124

                    WARNING  R[write to console]: Warning message:                                 callbacks.py:124

                    WARNING  R[write to console]: Removed 9 rows containing missing values         callbacks.py:124
                             (`geom_point()`).

Code

%%R -i all_events_or_N2_cleaned -c conv_pl
p <- plot_or_time(all_events_or_N2_cleaned)
p <- p + labs(y = "Normalized Occurrence Rates")
save_plot("ocr/ocr_time_N2_cleaned")
p

Saving 6.67 x 6.67 in image
Saving 6.67 x 6.67 in image

[12/04/23 09:16:01] WARNING  R[write to console]: In addition:                                     callbacks.py:124

                    WARNING  R[write to console]: Warning message:                                 callbacks.py:124

                    WARNING  R[write to console]: Removed 9 rows containing missing values         callbacks.py:124
                             (`geom_point()`).

In addition: Warning messages:
1: Removed 9 rows containing missing values (`geom_point()`). 
2: Removed 9 rows containing missing values (`geom_point()`).

Plot the occurrence rates with radial distance

Code

%%R
plot_or_r <- function(df, target_sat = "JNO") {
  "plot normalized occurrence rate over radial distance"
  
  # Filter data for target_sat
  df_target <- df[df$sat == target_sat,]
  
  # Compute the linear model
  fit <- lm(o_rates_normalized ~ I(1/ref_radial_distance), data = df_target)
  
  # Extract coefficients
  intercept <- coef(fit)[1]
  slope <- coef(fit)[2]
  
  # Format equation
  equation <- sprintf("y ~ %.2f / x", slope)
  
  p <- ggscatter(df, x = "ref_radial_distance", y = "o_rates_normalized", color = "sat") +
    geom_smooth(data = df_target, formula = y ~ I(1/x), method = "lm", color="gray", linetype="dashed")
    
  p <- p +
    labs(x = "Referred Radial Distance (AU)", y = "Occurrence Rate  (#/day)", color="Satellites") +
    annotate("text", label = equation, x = Inf, y = Inf, hjust = 1.1, vjust = 1.5, size = 7) +
    theme_pubr(base_size = 16) + 
    theme(legend.text = element_text(size=16)) +
    scale_color_okabeito(palette = "black_first")
  
  return(p)
}

Code

df = link_coord2dim(all_events_or_N1_cleaned).sort("ref_radial_distance")

Code

%%R -i df -c conv_pl
p <- plot_or_r(df)
print(p)
# save_plot("ocr_r_cleaned")

In addition: Warning messages:
1: Removed 4 rows containing non-finite values (`stat_smooth()`). 
2: Removed 21 rows containing missing values (`geom_point()`).

Code

df = link_coord2dim(all_events_or_N2_cleaned).sort("ref_radial_distance")

Code

%%R -i df -c conv_pl
#| code-summary: R Code
p <- plot_or_r(df) + labs(y = "Normalized Occurrence Rates")
print(p)
# save_plot("ocr/ocr_r_N2_cleaned")

In addition: Warning messages:
1: Removed 4 rows containing non-finite values (`stat_smooth()`). 
2: Removed 21 rows containing missing values (`geom_point()`).

Appendix

Obsolete codes

Notes: seaborn.lineplot drops nans from the DataFrame before plotting, this is not desired…

Code

import seaborn as sns

import matplotlib.pyplot as plt
import scienceplots
from discontinuitypy.utils.plot import savefig

Code

plt.style.use(['science', 'nature', 'notebook'])

Code


def plot_or_r(df: pl.DataFrame):
    "plot normalized occurence rate over radial distance"

    sns.lineplot(x="ref_radial_distance", y="o_rates_normalized", hue="sat", data=df)

    ax = plt.gca()  # Get current axis
    ax.set_yscale("log")
    ax.set_xlabel("Referred Radial Distance (AU)")
    ax.set_ylabel("Normalized Occurrence Rate")
    # savefig('occurrence_rate_ratio')

    return ax.figure

Code

def plot_or_time(df: pl.DataFrame):
    """Plot the occurence rate of the candidates with time.    
    """
    # Create a unique list of all satellites and sort them to let JNO' be plotted first
    all_sats = df["sat"].unique().to_list()
    all_sats.sort(key=lambda x: x != "JNO")

    # Plot each satellite separately
    for sat in all_sats:
        sat_df = df.filter(sat=sat)
        if sat == "JNO":
            sns.lineplot(sat_df, x="time", y="o_rates_normalized", label=sat)
        else:
            # Making the other satellites more distinct with linestyle and alpha
            sns.lineplot(
                sat_df,
                x="time",
                y="o_rates_normalized",
                linestyle="--",  # dashed line style
                alpha=0.5,  # keep the order of the legend
                label=sat,
            )

    ax = plt.gca()  # Get current axis
    # Set the y-axis and x-axis labels
    ax.set_ylabel("Occurrence Rates (#/day)")
    ax.set_xlabel("Date")
    ax.legend(title="Satellites")

    # savefig("occurrence_rates")
    return ax.figure

Code

fig = plot_or_time(
    all_events_or_N1
)

STEREO-A anomaly during 2014-08 period

Code

mag: pl.DataFrame = catalog.load('sta.inter_mag_1s').filter(pl.col('B')>0).collect()

Code

import plotly.graph_objects as go;
from plotly_resampler import FigureResampler
import plotly.express as px

Code

# px.line(mag, x='time', y='B') # This is extremely slow for large datasets

fig = FigureResampler(go.Figure())
fig.add_trace({"x": mag["time"], "y": mag['B']})
fig