Flying insect activity-levels analysis¶

This notebook demonstrates analysis of flying insect activity levels data from camfi.

First, load the required libraries.

from datetime import datetime, timedelta, timezone
import itertools
import textwrap

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

from camfi.projectconfig import CamfiConfig
from camfi.plotting.matplotlib import (
    plot_activity_levels_summary,
    plot_activity_levels_summaries,
    plot_maelstroms_with_temperature,
)

Next, load the image metadata (including annotations from VIA). This file should have the corrected image capture timestamps (refer to the Wingbeat Analysis notebook).

config_path = "data/cabramurra_config.yml"

config = CamfiConfig.parse_yaml_file(config_path)

# We can print out our config using config.json() or config.yaml()
print(config.json(exclude_unset=True, indent=2))

{
  "root": "data",
  "via_project_file": "data/cabramurra_all_annotations.json",
  "day_zero": "2019-01-01",
  "output_tz": "+10:00",
  "filters": {
    "image_filters": {
      "min_annotations": 1
    }
  },
  "camera": {
    "camera_time_to_actual_time_ratio": 1.0,
    "line_rate": 90500.0
  },
  "time": {
    "camera_placements": {
      "2019-11_cabramurra/0004": {
        "camera_start_time": "2019-10-14T13:00:00+11:00",
        "actual_start_time": "2019-11-14T13:00:00+11:00",
        "location": "cabramurra"
      },
      "2019-11_cabramurra": {
        "camera_start_time": "2019-11-14T13:00:00+11:00",
        "location": "cabramurra"
      }
    }
  },
  "place": {
    "locations": [
      {
        "name": "cabramurra",
        "lat": -35.9507,
        "lon": 148.3972,
        "elevation_m": 1513.9,
        "tz": "+10:00"
      }
    ],
    "weather_stations": [
      {
        "location": {
          "name": "cabramurra_smhea_aws_072161",
          "lat": -35.94,
          "lon": 148.38,
          "elevation_m": 1482.4,
          "tz": "+10:00"
        },
        "data_file": "data/cabramurra_bom_weather_201911.csv"
      }
    ],
    "location_weather_station_mapping": {
      "cabramurra": "cabramurra_smhea_aws_072161"
    }
  },
  "wingbeat_extraction": {
    "device": "cpu",
    "scan_distance": 50
  },
  "annotator": {
    "crop": {
      "x0": 0,
      "y0": 0,
      "x1": 4608,
      "y1": 3312
    },
    "training": {
      "mask_maker": {
        "shape": [
          3312,
          4608
        ],
        "mask_dilate": 5
      },
      "min_annotations": 1,
      "max_annotations": 50,
      "test_set_file": "data/cabramurra_test_set.txt",
      "device": "cuda",
      "batch_size": 5,
      "num_workers": 2,
      "num_epochs": 15,
      "outdir": "data",
      "save_intermediate": true
    },
    "inference": {
      "output_path": "data/cabramurra_autoannotated.json",
      "device": "cuda",
      "backup_device": "cpu",
      "score_thresh": 0.0
    },
    "validation": {
      "autoannotated_via_project_file": "data/cabramurra_autoannotated.json",
      "image_sets": [
        "all",
        "test",
        "train"
      ],
      "output_dir": "data"
    }
  }
}

To get the timestamps for the images, we need to read the EXIF metadata from the image files. Here we also apply time correction. The code is commented out since the metadata has already been loaded into "data/cabramurra_all_annotations.json", but if you are working with a different dataset, or would like to re-run IO intensive this step, uncomment the code. This could also be achieved from the command-line interface to Camfi.

Note: It is assumed you have downloaded and extracted the images to "data/". Of course you can extract it elsewhere and change root config variable accordingly. The repository containing the images used in this example can be found here: https://doi.org/10.5281/zenodo.4950570.

# Uncomment if exif metadata hasn't been loaded already.
# config.load_all_exif_metadata()

After running the above two steps, you might like to save the results to a new VIA project file. Uncommenting the following will save a new VIA project file to "data/all_annotations_with_metadata.json".

# with open("data/all_annotations_with_metadata.json", "w") as f:
#     f.write(config.via_project.json(indent=2, exclude_unset=True))

For the following analyses, we need a Pandas DataFrame. The folowwing command builds a dataframe with an entry for each image, with data taken from sources specified in the config. We will also convert all the datetime_corrected values to AEST (+10:00, set in config file) - in our data they are in AEDT (+11:00).

df = config.get_merged_dataframe()
df

		img_key	filename	n_annotations	datetime_corrected	datetime_original	exposure_time	pixel_x_dimension	pixel_y_dimension	astronomical_twilight_start	nautical_twilight_start	...	maximum_wind_gust_time	temperature_9am_degC	relative_humidity_9am_pc	cloud_amount_9am_oktas	wind_direction_9am	wind_speed_9am_kph	temperature_3pm_degC	relative_humidity_3pm_pc	wind_direction_3pm	wind_speed_3pm_kph
location	date
cabramurra	2019-11-14	2019-11_cabramurra/0001/DSCF0001.JPG-1	2019-11_cabramurra/0001/DSCF0001.JPG	0	2019-11-14 18:00:03+10:00	2019-11-14 19:00:03	0.012048	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	23:04	4.9	95	3.0	W	17	10.1	73	W	22
	2019-11-14	2019-11_cabramurra/0001/DSCF0002.JPG-1	2019-11_cabramurra/0001/DSCF0002.JPG	0	2019-11-14 18:10:06+10:00	2019-11-14 19:10:06	0.009174	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	23:04	4.9	95	3.0	W	17	10.1	73	W	22
	2019-11-14	2019-11_cabramurra/0001/DSCF0003.JPG-1	2019-11_cabramurra/0001/DSCF0003.JPG	0	2019-11-14 18:20:09+10:00	2019-11-14 19:20:09	0.012048	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	23:04	4.9	95	3.0	W	17	10.1	73	W	22
	2019-11-14	2019-11_cabramurra/0001/DSCF0004.JPG-1	2019-11_cabramurra/0001/DSCF0004.JPG	0	2019-11-14 18:30:11+10:00	2019-11-14 19:30:11	0.020833	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	23:04	4.9	95	3.0	W	17	10.1	73	W	22
	2019-11-14	2019-11_cabramurra/0001/DSCF0005.JPG-1	2019-11_cabramurra/0001/DSCF0005.JPG	0	2019-11-14 18:40:14+10:00	2019-11-14 19:40:14	0.033333	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	23:04	4.9	95	3.0	W	17	10.1	73	W	22
	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
	2019-11-26	2019-11_cabramurra/0010/DSCF0860.JPG-1	2019-11_cabramurra/0010/DSCF0860.JPG	0	2019-11-26 05:13:26+10:00	2019-11-26 06:13:26	0.033333	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	13:34	13.6	55	8.0	NNW	41	3.1	99	WNW	35
	2019-11-26	2019-11_cabramurra/0010/DSCF0861.JPG-1	2019-11_cabramurra/0010/DSCF0861.JPG	0	2019-11-26 05:23:29+10:00	2019-11-26 06:23:29	0.033333	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	13:34	13.6	55	8.0	NNW	41	3.1	99	WNW	35
	2019-11-26	2019-11_cabramurra/0010/DSCF0862.JPG-1	2019-11_cabramurra/0010/DSCF0862.JPG	0	2019-11-26 05:33:31+10:00	2019-11-26 06:33:31	0.023810	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	13:34	13.6	55	8.0	NNW	41	3.1	99	WNW	35
	2019-11-26	2019-11_cabramurra/0010/DSCF0863.JPG-1	2019-11_cabramurra/0010/DSCF0863.JPG	0	2019-11-26 05:43:34+10:00	2019-11-26 06:43:34	0.018182	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	13:34	13.6	55	8.0	NNW	41	3.1	99	WNW	35
	2019-11-26	2019-11_cabramurra/0010/DSCF0864.JPG-1	2019-11_cabramurra/0010/DSCF0864.JPG	0	2019-11-26 05:53:37+10:00	2019-11-26 06:53:37	0.013889	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	13:34	13.6	55	8.0	NNW	41	3.1	99	WNW	35

8640 rows × 36 columns

To get a general overview of the activity levels observed throughout the study period, we pool the data from all cameras and plot the number of annotations per 10-minute interval:

# Setting frame of reference and
# adding a daynumber column to df, for simpler plots
df["daynumber"] = (
    df["datetime_corrected"] - config.timestamp_zero
).dt.total_seconds() / 86400

location_names = [location.name for location in config.place.locations]
fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="daynumber",
    bin_width=10 / 1440,   # 10 minutes
    ax_kwargs=dict(
        ylabel="Number of annotations in 10 min bins",
        xlabel="Day Number"
    ),
    c="k",
)

../../_images/activity_analysis_12_0.png

The gaps in the above figure are periods where the cameras were not set to take photos (they were only set to take photos between the hours of 19:00-07:00 AEDT each night).

There seems to be a periodic signal in the data, with more activity in the evening. We can take a closer look at this by pooling the data from all days into a single representative 24-hour period.

df["dayhour"] = (df["daynumber"] - np.floor(df["daynumber"])) * 24.

fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="dayhour",
    bin_width=10 / 60,   # 10 minutes
    ax_kwargs=dict(
        ylabel="Number of annotations in 10 min bins",
        xlabel="Time of day (h)"
    ),
    c="k",
)

../../_images/activity_analysis_14_0.png

In the above figure we see a striking increase in activity levels during the hours of 19:20-20:20. This seems to be when the most insects are flying.

Instead of binning this data by absolute time of day, it would be nice to bin it according to the relative time from astronomical events, eg. sunset.

Here we calculate a “within_twilight” column, which is time after sunset, scaled to the duration of twilight. We’ll also calculate a “daylight_hours” column, which will be used later.

df["within_twilight"] = (
    df["datetime_corrected"] - df["sunset"]
) / (
    df["astronomical_twilight_end"] - df["sunset"]
)
df["daylight_hours"] = (df["sunset"] - df["sunrise"]).dt.total_seconds() / 3600

df

		img_key	filename	n_annotations	datetime_corrected	datetime_original	exposure_time	pixel_x_dimension	pixel_y_dimension	astronomical_twilight_start	nautical_twilight_start	...	wind_direction_9am	wind_speed_9am_kph	temperature_3pm_degC	relative_humidity_3pm_pc	wind_direction_3pm	wind_speed_3pm_kph	daynumber	dayhour	within_twilight	daylight_hours
location	date
cabramurra	2019-11-14	2019-11_cabramurra/0001/DSCF0001.JPG-1	2019-11_cabramurra/0001/DSCF0001.JPG	0	2019-11-14 18:00:03+10:00	2019-11-14 19:00:03	0.012048	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	W	17	10.1	73	W	22	317.750035	18.000833	-0.504471	13.978719
	2019-11-14	2019-11_cabramurra/0001/DSCF0002.JPG-1	2019-11_cabramurra/0001/DSCF0002.JPG	0	2019-11-14 18:10:06+10:00	2019-11-14 19:10:06	0.009174	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	W	17	10.1	73	W	22	317.757014	18.168333	-0.403632	13.978719
	2019-11-14	2019-11_cabramurra/0001/DSCF0003.JPG-1	2019-11_cabramurra/0001/DSCF0003.JPG	0	2019-11-14 18:20:09+10:00	2019-11-14 19:20:09	0.012048	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	W	17	10.1	73	W	22	317.763993	18.335833	-0.302793	13.978719
	2019-11-14	2019-11_cabramurra/0001/DSCF0004.JPG-1	2019-11_cabramurra/0001/DSCF0004.JPG	0	2019-11-14 18:30:11+10:00	2019-11-14 19:30:11	0.020833	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	W	17	10.1	73	W	22	317.770961	18.503056	-0.202121	13.978719
	2019-11-14	2019-11_cabramurra/0001/DSCF0005.JPG-1	2019-11_cabramurra/0001/DSCF0005.JPG	0	2019-11-14 18:40:14+10:00	2019-11-14 19:40:14	0.033333	4608	3456	2019-11-14 03:12:20.886936+10:00	2019-11-14 03:49:05.935004+10:00	...	W	17	10.1	73	W	22	317.777940	18.670556	-0.101282	13.978719
	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
	2019-11-26	2019-11_cabramurra/0010/DSCF0860.JPG-1	2019-11_cabramurra/0010/DSCF0860.JPG	0	2019-11-26 05:13:26+10:00	2019-11-26 06:13:26	0.033333	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	NNW	41	3.1	99	WNW	35	329.217662	5.223889	-7.947638	14.293727
	2019-11-26	2019-11_cabramurra/0010/DSCF0861.JPG-1	2019-11_cabramurra/0010/DSCF0861.JPG	0	2019-11-26 05:23:29+10:00	2019-11-26 06:23:29	0.033333	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	NNW	41	3.1	99	WNW	35	329.224641	5.391389	-7.851293	14.293727
	2019-11-26	2019-11_cabramurra/0010/DSCF0862.JPG-1	2019-11_cabramurra/0010/DSCF0862.JPG	0	2019-11-26 05:33:31+10:00	2019-11-26 06:33:31	0.023810	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	NNW	41	3.1	99	WNW	35	329.231609	5.558611	-7.755108	14.293727
	2019-11-26	2019-11_cabramurra/0010/DSCF0863.JPG-1	2019-11_cabramurra/0010/DSCF0863.JPG	0	2019-11-26 05:43:34+10:00	2019-11-26 06:43:34	0.018182	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	NNW	41	3.1	99	WNW	35	329.238588	5.726111	-7.658763	14.293727
	2019-11-26	2019-11_cabramurra/0010/DSCF0864.JPG-1	2019-11_cabramurra/0010/DSCF0864.JPG	0	2019-11-26 05:53:37+10:00	2019-11-26 06:53:37	0.013889	4608	3456	2019-11-26 03:00:54.332543+10:00	2019-11-26 03:39:54.781647+10:00	...	NNW	41	3.1	99	WNW	35	329.245567	5.893611	-7.562418	14.293727

8640 rows × 40 columns

We can now plot these data. This also plots a separate trace for each location (whereas the above plot only plots one trace for all the data).

fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="within_twilight",
    bin_width=0.1,   # 10th of twilight duration
    ax_kwargs=dict(
        title="Plot of number of annotations relative to twilight",
        ylabel="Number of annotations",
        xlabel="Twilight durations since sunset"
    ),
    separate_plots=False,
    c="k",
)
twilight_vspan = fig.axes[0].axvspan(0, 1, alpha=0.3, label="Evening twilight")
legend = fig.axes[0].legend()

../../_images/activity_analysis_18_0.png

The moths appear to be flying during twilight. So lets select only those time points to quantify daily maelstrom intensity. Taking the sum of annotations during this period for each day, we can then look at how activity levels were across the days of the study period.

# We're only interested in the numeric columns
keep_dtypes = set(
    np.dtype("".join(t)) for t in itertools.product("ifu", "248")
)
keep_cols = filter(lambda c: df[c].dtype in keep_dtypes, df.columns)

# But even some of the numeric columns are irrelevant after aggregation
drop_cols = {
    "exposure_time",
    "pixel_x_dimension",
    "pixel_y_dimension",
    "dayhour",
    "within_twilight",
}
keep_cols = filter(lambda c: c not in drop_cols, keep_cols)

# For most of the columns, we just want to take the mean value
aggregation_functions = {column: (column, "mean") for column in keep_cols}

# But for "n_annotations", we want to take the sum
aggregation_functions["n_annotations"] = ("n_annotations", "sum")

# And we want to truncate "daynumber" to an integer
aggregation_functions["daynumber"] = ("daynumber", lambda x: int(x[0]))

# We also want to make a new column with the exposure (number of images
# taken) during each twilight interval.
aggregation_functions["exposures"] = ("n_annotations", "count")

# Now we select the data which was obtained during twilight,
# and group it by location and date, aggregating using the above-defined
# aggregation functions. This leaves us with a DataFrame with one row per
# date for each location in the study.
maelstrom_df = df[
    (df["within_twilight"] <= 1.) & (df["within_twilight"] >= 0)
].groupby(["location", "date"]).aggregate(**aggregation_functions)
maelstrom_df

		n_annotations	lat	lon	elevation_m	temperature_minimum_degC	temperature_minimum_evening_degC	temperature_maximum_degC	rainfall_mm	maximum_wind_gust_speed_kph	temperature_9am_degC	relative_humidity_9am_pc	cloud_amount_9am_oktas	wind_speed_9am_kph	temperature_3pm_degC	relative_humidity_3pm_pc	wind_speed_3pm_kph	daynumber	daylight_hours	exposures
location	date
cabramurra	2019-11-14	80	-35.9507	148.3972	1513.9	-0.6	4.7	12.3	0.0	41.0	4.9	95.0	3.0	17.0	10.1	73.0	22.0	317	13.978719	90
	2019-11-15	42	-35.9507	148.3972	1513.9	4.7	3.7	13.8	0.0	70.0	8.3	73.0	2.0	31.0	12.8	53.0	31.0	318	14.007941	100
	2019-11-16	35	-35.9507	148.3972	1513.9	3.7	3.3	14.3	0.0	48.0	5.9	69.0	NaN	11.0	13.2	35.0	20.0	319	14.036682	100
	2019-11-17	50	-35.9507	148.3972	1513.9	3.3	4.5	14.3	0.0	33.0	8.4	42.0	0.0	7.0	13.4	26.0	13.0	320	14.064925	100
	2019-11-18	79	-35.9507	148.3972	1513.9	4.5	7.5	16.0	0.0	43.0	7.6	45.0	2.0	15.0	14.5	35.0	24.0	321	14.092652	100
	2019-11-19	105	-35.9507	148.3972	1513.9	7.5	11.0	21.7	0.0	65.0	13.0	41.0	0.0	30.0	20.6	22.0	30.0	322	14.119846	100
	2019-11-20	59	-35.9507	148.3972	1513.9	11.0	15.0	23.1	0.0	31.0	15.0	31.0	1.0	7.0	21.3	23.0	17.0	323	14.146488	100
	2019-11-21	117	-35.9507	148.3972	1513.9	15.0	17.2	27.6	0.0	50.0	22.0	27.0	1.0	13.0	27.3	22.0	22.0	324	14.172561	100
	2019-11-22	33	-35.9507	148.3972	1513.9	17.2	11.3	22.6	0.0	54.0	19.8	34.0	1.0	15.0	21.7	39.0	26.0	325	14.198045	110
	2019-11-23	40	-35.9507	148.3972	1513.9	11.3	8.2	20.0	0.0	43.0	13.9	36.0	0.0	15.0	18.5	28.0	22.0	326	14.222923	110
	2019-11-24	43	-35.9507	148.3972	1513.9	8.2	11.8	19.8	0.0	41.0	13.0	60.0	0.0	9.0	18.2	40.0	22.0	327	14.247175	100
	2019-11-25	36	-35.9507	148.3972	1513.9	11.8	13.0	21.9	0.0	50.0	13.9	33.0	0.0	20.0	20.4	26.0	28.0	328	14.270782	100

Now we can plot these data:

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Moth maelstrom activity at Cabramurra boulder field\nwith daily temperature records",
    ylabel="Number of annotations during maelstrom",
    xlabel="Day number",
)
lines = plot_maelstroms_with_temperature(
    maelstrom_df,
    ax,
    maelstrom_kwargs=dict(
        c="k",
        lw=3,
    ),
)

../../_images/activity_analysis_22_0.png

We then may like to regress the activity levels against various factors. Given the activity level count data, we can proceed using a Poisson regression of n_annotations vs. the independent variables of interest.

First, we will select non-correlated covariates from maelstrom_df. Here we can add derived covariates, such as temperature_range and dewpoint_degC.

We also define a set drop of columns not to include as covariates.

maelstrom_df["temperature_range"] = maelstrom_df.temperature_maximum_degC - maelstrom_df.temperature_minimum_evening_degC
# maelstrom_df["dewpoint_3pm_degC"] = mpcalc.dewpoint_from_relative_humidity(
#     units.Quantity(maelstrom_df["temperature_3pm_degC"].array, "degC"),
#     units.Quantity(maelstrom_df["relative_humidity_3pm_pc"].array, "percent"),
# )

drop = {
    "n_annotations",           # Variable of interest
    "rainfall_mm",             # All zero in this dataset
    "cloud_amount_9am_oktas",  # Hass missing data
    "exposures",               # Exposure variable
    "lat",
    "lon",                     # All locations are the same
    "elevation_m",             # in this example
}
covariates = list(filter(lambda c: c not in drop, maelstrom_df.columns))

We can then plot scatter plots of all of the covariates.

grr = pd.plotting.scatter_matrix(
    maelstrom_df[["n_annotations", *covariates]],
    marker="o",
    figsize=(15, 15),
)
for ax in grr[-1, :]:
    ax.set_xlabel(ax.get_xlabel(), rotation=45, ha="right")
for ax in grr[:, 0]:
    ax.set_ylabel(ax.get_ylabel(), rotation=45, ha="right")

../../_images/activity_analysis_26_0.png

Some of these look like they are correlated, so we should remove them from the list of covariates. We start by finding the pearson correlation between each of the varibles.

Note that the below dataframe has ones on the leading diagonal. Strictly speaking, these should be zeros (but leaving them as ones happens to be more convenient in this case).

correlation_p_vals = maelstrom_df[covariates].corr(
    method=lambda x, y: sp.stats.pearsonr(x, y)[1]
)

n_annotations_corr = maelstrom_df[covariates].corrwith(
    maelstrom_df["n_annotations"]
)

correlation_p_vals

	temperature_minimum_degC	temperature_minimum_evening_degC	temperature_maximum_degC	maximum_wind_gust_speed_kph	temperature_9am_degC	relative_humidity_9am_pc	wind_speed_9am_kph	temperature_3pm_degC	relative_humidity_3pm_pc	wind_speed_3pm_kph	daynumber	daylight_hours	temperature_range
temperature_minimum_degC	1.000000	0.002116	7.228477e-05	0.726410	0.000001	0.002227	0.765180	5.734440e-05	0.061840	0.583986	2.849026e-03	2.297827e-03	0.200886
temperature_minimum_evening_degC	0.002116	1.000000	1.477610e-06	0.773894	0.000269	0.009102	0.598260	1.022308e-05	0.059156	0.843996	8.241975e-03	7.022097e-03	0.768792
temperature_maximum_degC	0.000072	0.000001	1.000000e+00	0.886576	0.000003	0.001713	0.766278	1.321696e-11	0.019935	0.710210	6.060147e-03	4.804292e-03	0.518073
maximum_wind_gust_speed_kph	0.726410	0.773894	8.865758e-01	1.000000	0.765853	0.708056	0.000074	7.875770e-01	0.711903	0.000115	7.235197e-01	7.176197e-01	0.134835
temperature_9am_degC	0.000001	0.000269	2.677875e-06	0.765853	1.000000	0.003710	0.752917	1.057270e-06	0.069890	0.712510	9.914545e-03	8.194690e-03	0.306516
relative_humidity_9am_pc	0.002227	0.009102	1.712610e-03	0.708056	0.003710	1.000000	0.518422	1.337258e-03	0.000111	0.789428	1.408504e-02	1.071846e-02	0.290696
wind_speed_9am_kph	0.765180	0.598260	7.662779e-01	0.000074	0.752917	0.518422	1.000000	8.017529e-01	0.494823	0.000090	5.306412e-01	5.172520e-01	0.463215
temperature_3pm_degC	0.000057	0.000010	1.321696e-11	0.787577	0.000001	0.001337	0.801753	1.000000e+00	0.015457	0.700807	9.167460e-03	7.319337e-03	0.405187
relative_humidity_3pm_pc	0.061840	0.059156	1.993542e-02	0.711903	0.069890	0.000111	0.494823	1.545673e-02	1.000000	0.566386	6.486753e-02	5.388663e-02	0.244309
wind_speed_3pm_kph	0.583986	0.843996	7.102099e-01	0.000115	0.712510	0.789428	0.000090	7.008068e-01	0.566386	1.000000	7.355078e-01	7.546994e-01	0.552350
daynumber	0.002849	0.008242	6.060147e-03	0.723520	0.009915	0.014085	0.530641	9.167460e-03	0.064868	0.735508	1.000000e+00	2.993460e-16	0.750526
daylight_hours	0.002298	0.007022	4.804292e-03	0.717620	0.008195	0.010718	0.517252	7.319337e-03	0.053887	0.754699	2.993460e-16	1.000000e+00	0.723849
temperature_range	0.200886	0.768792	5.180730e-01	0.134835	0.306516	0.290696	0.463215	4.051872e-01	0.244309	0.552350	7.505260e-01	7.238494e-01	1.000000

We remove correlated variables in a greedy fashion, recursively selecting the most significantly correlated pair of variables, and removing the one which is not as well correlated with “n_annotations”. We’ll use a stringent p-value cutoff of 0.001 to make sure we don’t throw away any important variables.

def greedy_variate_removal(correlation_p_vals, p_cutoff, scores):
    r, c = divmod(np.argmin(correlation_p_vals), len(correlation_p_vals))

    if correlation_p_vals.iloc[r, c] >= p_cutoff:  # We're done!
        return correlation_p_vals

    drop_label = correlation_p_vals.index[
        [r, c][
            scores[correlation_p_vals.index[[r, c]]].argmin()
        ]
    ]
    correlation_p_vals = correlation_p_vals.drop(index=drop_label)
    correlation_p_vals.drop(columns=drop_label, inplace=True)

    return greedy_variate_removal(correlation_p_vals, p_cutoff, scores)


correlation_p_vals = greedy_variate_removal(
    correlation_p_vals, 0.001, n_annotations_corr
)
filtered_covariates = list(correlation_p_vals.index)
correlation_p_vals

	temperature_minimum_degC	temperature_minimum_evening_degC	relative_humidity_9am_pc	wind_speed_9am_kph	daylight_hours	temperature_range
temperature_minimum_degC	1.000000	0.002116	0.002227	0.765180	0.002298	0.200886
temperature_minimum_evening_degC	0.002116	1.000000	0.009102	0.598260	0.007022	0.768792
relative_humidity_9am_pc	0.002227	0.009102	1.000000	0.518422	0.010718	0.290696
wind_speed_9am_kph	0.765180	0.598260	0.518422	1.000000	0.517252	0.463215
daylight_hours	0.002298	0.007022	0.010718	0.517252	1.000000	0.723849
temperature_range	0.200886	0.768792	0.290696	0.463215	0.723849	1.000000

With this new filtered list of covariates, we again plot the pairs to make sure that everything looks nice and uncorrelated.

grr = pd.plotting.scatter_matrix(
    maelstrom_df[["n_annotations", *filtered_covariates]],
    marker="o",
    figsize=(15, 15),
)
for ax in grr[-1, :]:
    ax.set_xlabel(ax.get_xlabel(), rotation=45, ha="right")
for ax in grr[:, 0]:
    ax.set_ylabel(ax.get_ylabel(), rotation=45, ha="right")

../../_images/activity_analysis_32_0.png

Fitting a Poisson GLM of n_annotations vs. each covariate individually, and plotting the effect:

pois = sm.families.Poisson()

tvalues = []
pvalues = []
for covariate in filtered_covariates:
    mod = smf.glm(
        f"n_annotations ~ {covariate}",
        data=maelstrom_df,
        family=pois,
        exposure=maelstrom_df["exposures"],
    )
    res = mod.fit()
    tvalues.append(res.tvalues[1])
    pvalues.append(res.pvalues[1])

tvalues = np.array(tvalues)
pvalues = np.array(pvalues)

ordering = np.argsort(tvalues)
coloring = np.array(["r", "b"])[(pvalues[ordering] < 0.05).astype("u1")]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Single main effect plot",
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    np.array(filtered_covariates)[ordering],
    tvalues[ordering] - 1.96,
    tvalues[ordering] + 1.96,
    color=coloring
)
p = ax.scatter(
    tvalues[ordering],
    np.array(filtered_covariates)[ordering],
    color=coloring
)

significant_single_effect_variables = set(np.array(filtered_covariates)[pvalues < 0.05])
print("Significant single-effect variables:", *significant_single_effect_variables, sep="\n  - ")

Significant single-effect variables:
  - daylight_hours
  - temperature_minimum_evening_degC
  - temperature_range
  - wind_speed_9am_kph

../../_images/activity_analysis_34_1.png

Fitting a Poisson GLM of n_annotations vs. all of the covariates, and plotting the effect:

mod = smf.glm(
    "n_annotations ~ " + " + ".join(filtered_covariates),
    data=maelstrom_df,
    family=pois,
    exposure=maelstrom_df["exposures"],
)
res = mod.fit()
print(res.summary())
print(f"res.aic={res.aic}")

ordering = np.argsort(res.tvalues)
coloring = np.array(["r", "b"])[
    (res.pvalues[ordering] < 0.05).astype("u1")
]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    res.tvalues.index[ordering],
    res.tvalues[ordering] - 1.96,
    res.tvalues[ordering] + 1.96,
    color=coloring,
)
p = ax.scatter(res.tvalues[ordering], res.tvalues.index[ordering], color=coloring)

significant_mixed_effect_variables = set(res.tvalues.index[res.pvalues < 0.05])

print("\nSignificant mixed-effect variables:", *significant_mixed_effect_variables, sep="\n  - ")

                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:          n_annotations   No. Observations:                   12
Model:                            GLM   Df Residuals:                        5
Model Family:                 Poisson   Df Model:                            6
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47.274
Date:                Mon, 09 Aug 2021   Deviance:                       24.473
Time:                        14:47:53   Pearson chi2:                     24.2
No. Iterations:                     4
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                           41.3617     11.516      3.592      0.000      18.790      63.933
temperature_minimum_degC            -0.1297      0.025     -5.230      0.000      -0.178      -0.081
temperature_minimum_evening_degC     0.1897      0.022      8.543      0.000       0.146       0.233
relative_humidity_9am_pc            -0.0017      0.004     -0.468      0.640      -0.009       0.005
wind_speed_9am_kph                   0.0062      0.006      1.120      0.263      -0.005       0.017
daylight_hours                      -3.1266      0.806     -3.881      0.000      -4.705      -1.548
temperature_range                    0.1548      0.052      3.000      0.003       0.054       0.256
====================================================================================================
res.aic=108.54753196329011

Significant mixed-effect variables:
  - temperature_minimum_degC
  - daylight_hours
  - temperature_minimum_evening_degC
  - Intercept
  - temperature_range

../../_images/activity_analysis_36_1.png

From the above plot, we can see that some of the variables do not have a significant effect on “n_annotations”. So let’s remove them and see if we get a better model.

sig_eff_variables = list(significant_mixed_effect_variables - {"Intercept"})

mod = smf.glm(
    "n_annotations ~ " + " + ".join(sig_eff_variables),
    data=maelstrom_df,
    family=pois,
    exposure=maelstrom_df["exposures"],
)
res = mod.fit()
print(res.summary())
print(f"res.aic={res.aic}")

ordering = np.argsort(res.tvalues)
coloring = np.array(["r", "b"])[
    (res.pvalues[ordering] < 0.05).astype("u1")
]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    res.tvalues.index[ordering],
    res.tvalues[ordering] - 1.96,
    res.tvalues[ordering] + 1.96,
    color=coloring,
)
p = ax.scatter(res.tvalues[ordering], res.tvalues.index[ordering], color=coloring)

                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:          n_annotations   No. Observations:                   12
Model:                            GLM   Df Residuals:                        7
Model Family:                 Poisson   Df Model:                            4
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47.915
Date:                Mon, 09 Aug 2021   Deviance:                       25.756
Time:                        14:47:53   Pearson chi2:                     25.3
No. Iterations:                     4
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                           39.6614     10.961      3.618      0.000      18.178      61.144
temperature_minimum_degC            -0.1381      0.024     -5.790      0.000      -0.185      -0.091
temperature_minimum_evening_degC     0.1997      0.020     10.108      0.000       0.161       0.238
daylight_hours                      -3.0297      0.775     -3.909      0.000      -4.549      -1.511
temperature_range                    0.1882      0.041      4.562      0.000       0.107       0.269
====================================================================================================
res.aic=105.83037074639384

../../_images/activity_analysis_38_1.png

Removing the variables from the model lowered AIC, meaning that not much information was lost, while reducing the risk of overfitting. We will proceed with this model.

To check that nothing strange is going on, we can plot the residuals:

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Residual plot for:\n" + "\n".join(textwrap.wrap(mod.formula, width=55)),
    ylabel="Pearson residual",
    xlabel="Fitted values (n_annotations)",
)

ax.axhline(0, c="gray")
p = ax.scatter(res.fittedvalues, res.resid_pearson, marker="x", c="k")

../../_images/activity_analysis_40_0.png

Finally, we can combine the summary abundance/activity plots from above, this time including the predicted values for “n_annotations” (and confidence interval) into the maelstrom plot.

prediction = res.get_prediction()

# Set up figure
fig = plt.figure(
    figsize=(7.5, 5.2),
    tight_layout=True,
)
title_y = 0.87
title_fontdict = {"fontweight": "bold"}

# Define each subplot
# General summary plot
ax1 = fig.add_subplot(
    221,
    xlabel="Day Number",
    ylabel="Number of annotations",
)
ax1.set_title(
    " (a)", fontdict=title_fontdict, loc="left", y=title_y
)
# Twilight plot
ax2 = fig.add_subplot(
    222,
    xlabel="Twilight durations since sunset",
)
ax2.set_title(
    " (b)", fontdict=title_fontdict, loc="left", y=title_y
)
# Maelstrom plot with temperatures
ax3 = fig.add_subplot(
    212,
    xlabel="Day Number",
    ylabel="Number of annotations\nDuring twilight",
)
ax3.set_title(
    " (c)", fontdict=title_fontdict, loc="left", y=title_y
)

# Actual plotting
# General summary plot
plot_activity_levels_summary(
    df,
    ax1,
    x_column="daynumber",
    bin_width=10 / 1440,   # 10 minutes
    c="k",
    lw=0.75,
)

# Twilight plot
plot_activity_levels_summary(
    df,
    ax2,
    x_column="within_twilight",
    bin_width=0.1,   # 10th of twilight duration
    c="k",
    lw=0.75,
)
twilight_vspan = ax2.axvspan(
    0, 1, alpha=0.3, label="Evening twilight"
)

# Maelstrom plot with temperatures
plot_maelstroms_with_temperature(
    maelstrom_df,
    ax3,
    maelstrom_kwargs=dict(
        c="k",
        lw=3,
    ),
)

# Model prediction
ax3.plot(
    maelstrom_df["daynumber"],
    prediction.predicted_mean * maelstrom_df["exposures"],
    color="g",
    alpha=0.7,
    linestyle="dotted",
)
conf_int = prediction.conf_int()
ax3.fill_between(
    maelstrom_df["daynumber"],
    conf_int[:, 0] * maelstrom_df["exposures"],
    conf_int[:, 1] * maelstrom_df["exposures"],
    color="g",
    alpha=0.25,
    linestyle="dotted",
)

# Force x-axis ticks to be integers to make it prettier
for ax in fig.axes:
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

../../_images/activity_analysis_42_0.png

fig.savefig("activity_levels_figure.pdf", dpi=600.0, pad_inches=0.0)