Flying insect activity-levels analysis

This notebook demonstrates analysis of flying insect activity levels data from camfi.

First, load the required libraries.

from datetime import datetime, timedelta, timezone
import itertools
import textwrap

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

from camfi.projectconfig import CamfiConfig
from camfi.plotting.matplotlib import (
    plot_activity_levels_summary,
    plot_activity_levels_summaries,
    plot_maelstroms_with_temperature,
)

Next, load the image metadata (including annotations from VIA). This file should have the corrected image capture timestamps (refer to the Wingbeat Analysis notebook).

config_path = "data/cabramurra_config.yml"

config = CamfiConfig.parse_yaml_file(config_path)

# We can print out our config using config.json() or config.yaml()
print(config.json(exclude_unset=True, indent=2))
{
  "root": "data",
  "via_project_file": "data/cabramurra_all_annotations.json",
  "day_zero": "2019-01-01",
  "output_tz": "+10:00",
  "filters": {
    "image_filters": {
      "min_annotations": 1
    }
  },
  "camera": {
    "camera_time_to_actual_time_ratio": 1.0,
    "line_rate": 90500.0
  },
  "time": {
    "camera_placements": {
      "2019-11_cabramurra/0004": {
        "camera_start_time": "2019-10-14T13:00:00+11:00",
        "actual_start_time": "2019-11-14T13:00:00+11:00",
        "location": "cabramurra"
      },
      "2019-11_cabramurra": {
        "camera_start_time": "2019-11-14T13:00:00+11:00",
        "location": "cabramurra"
      }
    }
  },
  "place": {
    "locations": [
      {
        "name": "cabramurra",
        "lat": -35.9507,
        "lon": 148.3972,
        "elevation_m": 1513.9,
        "tz": "+10:00"
      }
    ],
    "weather_stations": [
      {
        "location": {
          "name": "cabramurra_smhea_aws_072161",
          "lat": -35.94,
          "lon": 148.38,
          "elevation_m": 1482.4,
          "tz": "+10:00"
        },
        "data_file": "data/cabramurra_bom_weather_201911.csv"
      }
    ],
    "location_weather_station_mapping": {
      "cabramurra": "cabramurra_smhea_aws_072161"
    }
  },
  "wingbeat_extraction": {
    "device": "cpu",
    "scan_distance": 50
  },
  "annotator": {
    "crop": {
      "x0": 0,
      "y0": 0,
      "x1": 4608,
      "y1": 3312
    },
    "training": {
      "mask_maker": {
        "shape": [
          3312,
          4608
        ],
        "mask_dilate": 5
      },
      "min_annotations": 1,
      "max_annotations": 50,
      "test_set_file": "data/cabramurra_test_set.txt",
      "device": "cuda",
      "batch_size": 5,
      "num_workers": 2,
      "num_epochs": 15,
      "outdir": "data",
      "save_intermediate": true
    },
    "inference": {
      "output_path": "data/cabramurra_autoannotated.json",
      "device": "cuda",
      "backup_device": "cpu",
      "score_thresh": 0.0
    },
    "validation": {
      "autoannotated_via_project_file": "data/cabramurra_autoannotated.json",
      "image_sets": [
        "all",
        "test",
        "train"
      ],
      "output_dir": "data"
    }
  }
}

To get the timestamps for the images, we need to read the EXIF metadata from the image files. Here we also apply time correction. The code is commented out since the metadata has already been loaded into "data/cabramurra_all_annotations.json", but if you are working with a different dataset, or would like to re-run IO intensive this step, uncomment the code. This could also be achieved from the command-line interface to Camfi.

Note: It is assumed you have downloaded and extracted the images to "data/". Of course you can extract it elsewhere and change root config variable accordingly. The repository containing the images used in this example can be found here: https://doi.org/10.5281/zenodo.4950570.

# Uncomment if exif metadata hasn't been loaded already.
# config.load_all_exif_metadata()

After running the above two steps, you might like to save the results to a new VIA project file. Uncommenting the following will save a new VIA project file to "data/all_annotations_with_metadata.json".

# with open("data/all_annotations_with_metadata.json", "w") as f:
#     f.write(config.via_project.json(indent=2, exclude_unset=True))

For the following analyses, we need a Pandas DataFrame. The folowwing command builds a dataframe with an entry for each image, with data taken from sources specified in the config. We will also convert all the datetime_corrected values to AEST (+10:00, set in config file) - in our data they are in AEDT (+11:00).

df = config.get_merged_dataframe()
df
img_key filename n_annotations datetime_corrected datetime_original exposure_time pixel_x_dimension pixel_y_dimension astronomical_twilight_start nautical_twilight_start ... maximum_wind_gust_time temperature_9am_degC relative_humidity_9am_pc cloud_amount_9am_oktas wind_direction_9am wind_speed_9am_kph temperature_3pm_degC relative_humidity_3pm_pc wind_direction_3pm wind_speed_3pm_kph
location date
cabramurra 2019-11-14 2019-11_cabramurra/0001/DSCF0001.JPG-1 2019-11_cabramurra/0001/DSCF0001.JPG 0 2019-11-14 18:00:03+10:00 2019-11-14 19:00:03 0.012048 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... 23:04 4.9 95 3.0 W 17 10.1 73 W 22
2019-11-14 2019-11_cabramurra/0001/DSCF0002.JPG-1 2019-11_cabramurra/0001/DSCF0002.JPG 0 2019-11-14 18:10:06+10:00 2019-11-14 19:10:06 0.009174 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... 23:04 4.9 95 3.0 W 17 10.1 73 W 22
2019-11-14 2019-11_cabramurra/0001/DSCF0003.JPG-1 2019-11_cabramurra/0001/DSCF0003.JPG 0 2019-11-14 18:20:09+10:00 2019-11-14 19:20:09 0.012048 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... 23:04 4.9 95 3.0 W 17 10.1 73 W 22
2019-11-14 2019-11_cabramurra/0001/DSCF0004.JPG-1 2019-11_cabramurra/0001/DSCF0004.JPG 0 2019-11-14 18:30:11+10:00 2019-11-14 19:30:11 0.020833 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... 23:04 4.9 95 3.0 W 17 10.1 73 W 22
2019-11-14 2019-11_cabramurra/0001/DSCF0005.JPG-1 2019-11_cabramurra/0001/DSCF0005.JPG 0 2019-11-14 18:40:14+10:00 2019-11-14 19:40:14 0.033333 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... 23:04 4.9 95 3.0 W 17 10.1 73 W 22
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2019-11-26 2019-11_cabramurra/0010/DSCF0860.JPG-1 2019-11_cabramurra/0010/DSCF0860.JPG 0 2019-11-26 05:13:26+10:00 2019-11-26 06:13:26 0.033333 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... 13:34 13.6 55 8.0 NNW 41 3.1 99 WNW 35
2019-11-26 2019-11_cabramurra/0010/DSCF0861.JPG-1 2019-11_cabramurra/0010/DSCF0861.JPG 0 2019-11-26 05:23:29+10:00 2019-11-26 06:23:29 0.033333 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... 13:34 13.6 55 8.0 NNW 41 3.1 99 WNW 35
2019-11-26 2019-11_cabramurra/0010/DSCF0862.JPG-1 2019-11_cabramurra/0010/DSCF0862.JPG 0 2019-11-26 05:33:31+10:00 2019-11-26 06:33:31 0.023810 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... 13:34 13.6 55 8.0 NNW 41 3.1 99 WNW 35
2019-11-26 2019-11_cabramurra/0010/DSCF0863.JPG-1 2019-11_cabramurra/0010/DSCF0863.JPG 0 2019-11-26 05:43:34+10:00 2019-11-26 06:43:34 0.018182 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... 13:34 13.6 55 8.0 NNW 41 3.1 99 WNW 35
2019-11-26 2019-11_cabramurra/0010/DSCF0864.JPG-1 2019-11_cabramurra/0010/DSCF0864.JPG 0 2019-11-26 05:53:37+10:00 2019-11-26 06:53:37 0.013889 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... 13:34 13.6 55 8.0 NNW 41 3.1 99 WNW 35

8640 rows × 36 columns

To get a general overview of the activity levels observed throughout the study period, we pool the data from all cameras and plot the number of annotations per 10-minute interval:

# Setting frame of reference and
# adding a daynumber column to df, for simpler plots
df["daynumber"] = (
    df["datetime_corrected"] - config.timestamp_zero
).dt.total_seconds() / 86400
location_names = [location.name for location in config.place.locations]
fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="daynumber",
    bin_width=10 / 1440,   # 10 minutes
    ax_kwargs=dict(
        ylabel="Number of annotations in 10 min bins",
        xlabel="Day Number"
    ),
    c="k",
)
../../_images/activity_analysis_12_0.png

The gaps in the above figure are periods where the cameras were not set to take photos (they were only set to take photos between the hours of 19:00-07:00 AEDT each night).

There seems to be a periodic signal in the data, with more activity in the evening. We can take a closer look at this by pooling the data from all days into a single representative 24-hour period.

df["dayhour"] = (df["daynumber"] - np.floor(df["daynumber"])) * 24.

fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="dayhour",
    bin_width=10 / 60,   # 10 minutes
    ax_kwargs=dict(
        ylabel="Number of annotations in 10 min bins",
        xlabel="Time of day (h)"
    ),
    c="k",
)
../../_images/activity_analysis_14_0.png

In the above figure we see a striking increase in activity levels during the hours of 19:20-20:20. This seems to be when the most insects are flying.

Instead of binning this data by absolute time of day, it would be nice to bin it according to the relative time from astronomical events, eg. sunset.

Here we calculate a “within_twilight” column, which is time after sunset, scaled to the duration of twilight. We’ll also calculate a “daylight_hours” column, which will be used later.

df["within_twilight"] = (
    df["datetime_corrected"] - df["sunset"]
) / (
    df["astronomical_twilight_end"] - df["sunset"]
)
df["daylight_hours"] = (df["sunset"] - df["sunrise"]).dt.total_seconds() / 3600

df
img_key filename n_annotations datetime_corrected datetime_original exposure_time pixel_x_dimension pixel_y_dimension astronomical_twilight_start nautical_twilight_start ... wind_direction_9am wind_speed_9am_kph temperature_3pm_degC relative_humidity_3pm_pc wind_direction_3pm wind_speed_3pm_kph daynumber dayhour within_twilight daylight_hours
location date
cabramurra 2019-11-14 2019-11_cabramurra/0001/DSCF0001.JPG-1 2019-11_cabramurra/0001/DSCF0001.JPG 0 2019-11-14 18:00:03+10:00 2019-11-14 19:00:03 0.012048 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... W 17 10.1 73 W 22 317.750035 18.000833 -0.504471 13.978719
2019-11-14 2019-11_cabramurra/0001/DSCF0002.JPG-1 2019-11_cabramurra/0001/DSCF0002.JPG 0 2019-11-14 18:10:06+10:00 2019-11-14 19:10:06 0.009174 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... W 17 10.1 73 W 22 317.757014 18.168333 -0.403632 13.978719
2019-11-14 2019-11_cabramurra/0001/DSCF0003.JPG-1 2019-11_cabramurra/0001/DSCF0003.JPG 0 2019-11-14 18:20:09+10:00 2019-11-14 19:20:09 0.012048 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... W 17 10.1 73 W 22 317.763993 18.335833 -0.302793 13.978719
2019-11-14 2019-11_cabramurra/0001/DSCF0004.JPG-1 2019-11_cabramurra/0001/DSCF0004.JPG 0 2019-11-14 18:30:11+10:00 2019-11-14 19:30:11 0.020833 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... W 17 10.1 73 W 22 317.770961 18.503056 -0.202121 13.978719
2019-11-14 2019-11_cabramurra/0001/DSCF0005.JPG-1 2019-11_cabramurra/0001/DSCF0005.JPG 0 2019-11-14 18:40:14+10:00 2019-11-14 19:40:14 0.033333 4608 3456 2019-11-14 03:12:20.886936+10:00 2019-11-14 03:49:05.935004+10:00 ... W 17 10.1 73 W 22 317.777940 18.670556 -0.101282 13.978719
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2019-11-26 2019-11_cabramurra/0010/DSCF0860.JPG-1 2019-11_cabramurra/0010/DSCF0860.JPG 0 2019-11-26 05:13:26+10:00 2019-11-26 06:13:26 0.033333 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... NNW 41 3.1 99 WNW 35 329.217662 5.223889 -7.947638 14.293727
2019-11-26 2019-11_cabramurra/0010/DSCF0861.JPG-1 2019-11_cabramurra/0010/DSCF0861.JPG 0 2019-11-26 05:23:29+10:00 2019-11-26 06:23:29 0.033333 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... NNW 41 3.1 99 WNW 35 329.224641 5.391389 -7.851293 14.293727
2019-11-26 2019-11_cabramurra/0010/DSCF0862.JPG-1 2019-11_cabramurra/0010/DSCF0862.JPG 0 2019-11-26 05:33:31+10:00 2019-11-26 06:33:31 0.023810 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... NNW 41 3.1 99 WNW 35 329.231609 5.558611 -7.755108 14.293727
2019-11-26 2019-11_cabramurra/0010/DSCF0863.JPG-1 2019-11_cabramurra/0010/DSCF0863.JPG 0 2019-11-26 05:43:34+10:00 2019-11-26 06:43:34 0.018182 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... NNW 41 3.1 99 WNW 35 329.238588 5.726111 -7.658763 14.293727
2019-11-26 2019-11_cabramurra/0010/DSCF0864.JPG-1 2019-11_cabramurra/0010/DSCF0864.JPG 0 2019-11-26 05:53:37+10:00 2019-11-26 06:53:37 0.013889 4608 3456 2019-11-26 03:00:54.332543+10:00 2019-11-26 03:39:54.781647+10:00 ... NNW 41 3.1 99 WNW 35 329.245567 5.893611 -7.562418 14.293727

8640 rows × 40 columns

We can now plot these data. This also plots a separate trace for each location (whereas the above plot only plots one trace for all the data).

fig = plot_activity_levels_summaries(
    df,
    location_names,
    x_column="within_twilight",
    bin_width=0.1,   # 10th of twilight duration
    ax_kwargs=dict(
        title="Plot of number of annotations relative to twilight",
        ylabel="Number of annotations",
        xlabel="Twilight durations since sunset"
    ),
    separate_plots=False,
    c="k",
)
twilight_vspan = fig.axes[0].axvspan(0, 1, alpha=0.3, label="Evening twilight")
legend = fig.axes[0].legend()
../../_images/activity_analysis_18_0.png

The moths appear to be flying during twilight. So lets select only those time points to quantify daily maelstrom intensity. Taking the sum of annotations during this period for each day, we can then look at how activity levels were across the days of the study period.

# We're only interested in the numeric columns
keep_dtypes = set(
    np.dtype("".join(t)) for t in itertools.product("ifu", "248")
)
keep_cols = filter(lambda c: df[c].dtype in keep_dtypes, df.columns)

# But even some of the numeric columns are irrelevant after aggregation
drop_cols = {
    "exposure_time",
    "pixel_x_dimension",
    "pixel_y_dimension",
    "dayhour",
    "within_twilight",
}
keep_cols = filter(lambda c: c not in drop_cols, keep_cols)

# For most of the columns, we just want to take the mean value
aggregation_functions = {column: (column, "mean") for column in keep_cols}

# But for "n_annotations", we want to take the sum
aggregation_functions["n_annotations"] = ("n_annotations", "sum")

# And we want to truncate "daynumber" to an integer
aggregation_functions["daynumber"] = ("daynumber", lambda x: int(x[0]))

# We also want to make a new column with the exposure (number of images
# taken) during each twilight interval.
aggregation_functions["exposures"] = ("n_annotations", "count")

# Now we select the data which was obtained during twilight,
# and group it by location and date, aggregating using the above-defined
# aggregation functions. This leaves us with a DataFrame with one row per
# date for each location in the study.
maelstrom_df = df[
    (df["within_twilight"] <= 1.) & (df["within_twilight"] >= 0)
].groupby(["location", "date"]).aggregate(**aggregation_functions)
maelstrom_df
n_annotations lat lon elevation_m temperature_minimum_degC temperature_minimum_evening_degC temperature_maximum_degC rainfall_mm maximum_wind_gust_speed_kph temperature_9am_degC relative_humidity_9am_pc cloud_amount_9am_oktas wind_speed_9am_kph temperature_3pm_degC relative_humidity_3pm_pc wind_speed_3pm_kph daynumber daylight_hours exposures
location date
cabramurra 2019-11-14 80 -35.9507 148.3972 1513.9 -0.6 4.7 12.3 0.0 41.0 4.9 95.0 3.0 17.0 10.1 73.0 22.0 317 13.978719 90
2019-11-15 42 -35.9507 148.3972 1513.9 4.7 3.7 13.8 0.0 70.0 8.3 73.0 2.0 31.0 12.8 53.0 31.0 318 14.007941 100
2019-11-16 35 -35.9507 148.3972 1513.9 3.7 3.3 14.3 0.0 48.0 5.9 69.0 NaN 11.0 13.2 35.0 20.0 319 14.036682 100
2019-11-17 50 -35.9507 148.3972 1513.9 3.3 4.5 14.3 0.0 33.0 8.4 42.0 0.0 7.0 13.4 26.0 13.0 320 14.064925 100
2019-11-18 79 -35.9507 148.3972 1513.9 4.5 7.5 16.0 0.0 43.0 7.6 45.0 2.0 15.0 14.5 35.0 24.0 321 14.092652 100
2019-11-19 105 -35.9507 148.3972 1513.9 7.5 11.0 21.7 0.0 65.0 13.0 41.0 0.0 30.0 20.6 22.0 30.0 322 14.119846 100
2019-11-20 59 -35.9507 148.3972 1513.9 11.0 15.0 23.1 0.0 31.0 15.0 31.0 1.0 7.0 21.3 23.0 17.0 323 14.146488 100
2019-11-21 117 -35.9507 148.3972 1513.9 15.0 17.2 27.6 0.0 50.0 22.0 27.0 1.0 13.0 27.3 22.0 22.0 324 14.172561 100
2019-11-22 33 -35.9507 148.3972 1513.9 17.2 11.3 22.6 0.0 54.0 19.8 34.0 1.0 15.0 21.7 39.0 26.0 325 14.198045 110
2019-11-23 40 -35.9507 148.3972 1513.9 11.3 8.2 20.0 0.0 43.0 13.9 36.0 0.0 15.0 18.5 28.0 22.0 326 14.222923 110
2019-11-24 43 -35.9507 148.3972 1513.9 8.2 11.8 19.8 0.0 41.0 13.0 60.0 0.0 9.0 18.2 40.0 22.0 327 14.247175 100
2019-11-25 36 -35.9507 148.3972 1513.9 11.8 13.0 21.9 0.0 50.0 13.9 33.0 0.0 20.0 20.4 26.0 28.0 328 14.270782 100

Now we can plot these data:

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Moth maelstrom activity at Cabramurra boulder field\nwith daily temperature records",
    ylabel="Number of annotations during maelstrom",
    xlabel="Day number",
)
lines = plot_maelstroms_with_temperature(
    maelstrom_df,
    ax,
    maelstrom_kwargs=dict(
        c="k",
        lw=3,
    ),
)
../../_images/activity_analysis_22_0.png

We then may like to regress the activity levels against various factors. Given the activity level count data, we can proceed using a Poisson regression of n_annotations vs. the independent variables of interest.

First, we will select non-correlated covariates from maelstrom_df. Here we can add derived covariates, such as temperature_range and dewpoint_degC.

We also define a set drop of columns not to include as covariates.

maelstrom_df["temperature_range"] = maelstrom_df.temperature_maximum_degC - maelstrom_df.temperature_minimum_evening_degC
# maelstrom_df["dewpoint_3pm_degC"] = mpcalc.dewpoint_from_relative_humidity(
#     units.Quantity(maelstrom_df["temperature_3pm_degC"].array, "degC"),
#     units.Quantity(maelstrom_df["relative_humidity_3pm_pc"].array, "percent"),
# )

drop = {
    "n_annotations",           # Variable of interest
    "rainfall_mm",             # All zero in this dataset
    "cloud_amount_9am_oktas",  # Hass missing data
    "exposures",               # Exposure variable
    "lat",
    "lon",                     # All locations are the same
    "elevation_m",             # in this example
}
covariates = list(filter(lambda c: c not in drop, maelstrom_df.columns))

We can then plot scatter plots of all of the covariates.

grr = pd.plotting.scatter_matrix(
    maelstrom_df[["n_annotations", *covariates]],
    marker="o",
    figsize=(15, 15),
)
for ax in grr[-1, :]:
    ax.set_xlabel(ax.get_xlabel(), rotation=45, ha="right")
for ax in grr[:, 0]:
    ax.set_ylabel(ax.get_ylabel(), rotation=45, ha="right")
../../_images/activity_analysis_26_0.png

Some of these look like they are correlated, so we should remove them from the list of covariates. We start by finding the pearson correlation between each of the varibles.

Note that the below dataframe has ones on the leading diagonal. Strictly speaking, these should be zeros (but leaving them as ones happens to be more convenient in this case).

correlation_p_vals = maelstrom_df[covariates].corr(
    method=lambda x, y: sp.stats.pearsonr(x, y)[1]
)

n_annotations_corr = maelstrom_df[covariates].corrwith(
    maelstrom_df["n_annotations"]
)

correlation_p_vals
temperature_minimum_degC temperature_minimum_evening_degC temperature_maximum_degC maximum_wind_gust_speed_kph temperature_9am_degC relative_humidity_9am_pc wind_speed_9am_kph temperature_3pm_degC relative_humidity_3pm_pc wind_speed_3pm_kph daynumber daylight_hours temperature_range
temperature_minimum_degC 1.000000 0.002116 7.228477e-05 0.726410 0.000001 0.002227 0.765180 5.734440e-05 0.061840 0.583986 2.849026e-03 2.297827e-03 0.200886
temperature_minimum_evening_degC 0.002116 1.000000 1.477610e-06 0.773894 0.000269 0.009102 0.598260 1.022308e-05 0.059156 0.843996 8.241975e-03 7.022097e-03 0.768792
temperature_maximum_degC 0.000072 0.000001 1.000000e+00 0.886576 0.000003 0.001713 0.766278 1.321696e-11 0.019935 0.710210 6.060147e-03 4.804292e-03 0.518073
maximum_wind_gust_speed_kph 0.726410 0.773894 8.865758e-01 1.000000 0.765853 0.708056 0.000074 7.875770e-01 0.711903 0.000115 7.235197e-01 7.176197e-01 0.134835
temperature_9am_degC 0.000001 0.000269 2.677875e-06 0.765853 1.000000 0.003710 0.752917 1.057270e-06 0.069890 0.712510 9.914545e-03 8.194690e-03 0.306516
relative_humidity_9am_pc 0.002227 0.009102 1.712610e-03 0.708056 0.003710 1.000000 0.518422 1.337258e-03 0.000111 0.789428 1.408504e-02 1.071846e-02 0.290696
wind_speed_9am_kph 0.765180 0.598260 7.662779e-01 0.000074 0.752917 0.518422 1.000000 8.017529e-01 0.494823 0.000090 5.306412e-01 5.172520e-01 0.463215
temperature_3pm_degC 0.000057 0.000010 1.321696e-11 0.787577 0.000001 0.001337 0.801753 1.000000e+00 0.015457 0.700807 9.167460e-03 7.319337e-03 0.405187
relative_humidity_3pm_pc 0.061840 0.059156 1.993542e-02 0.711903 0.069890 0.000111 0.494823 1.545673e-02 1.000000 0.566386 6.486753e-02 5.388663e-02 0.244309
wind_speed_3pm_kph 0.583986 0.843996 7.102099e-01 0.000115 0.712510 0.789428 0.000090 7.008068e-01 0.566386 1.000000 7.355078e-01 7.546994e-01 0.552350
daynumber 0.002849 0.008242 6.060147e-03 0.723520 0.009915 0.014085 0.530641 9.167460e-03 0.064868 0.735508 1.000000e+00 2.993460e-16 0.750526
daylight_hours 0.002298 0.007022 4.804292e-03 0.717620 0.008195 0.010718 0.517252 7.319337e-03 0.053887 0.754699 2.993460e-16 1.000000e+00 0.723849
temperature_range 0.200886 0.768792 5.180730e-01 0.134835 0.306516 0.290696 0.463215 4.051872e-01 0.244309 0.552350 7.505260e-01 7.238494e-01 1.000000

We remove correlated variables in a greedy fashion, recursively selecting the most significantly correlated pair of variables, and removing the one which is not as well correlated with “n_annotations”. We’ll use a stringent p-value cutoff of 0.001 to make sure we don’t throw away any important variables.

def greedy_variate_removal(correlation_p_vals, p_cutoff, scores):
    r, c = divmod(np.argmin(correlation_p_vals), len(correlation_p_vals))

    if correlation_p_vals.iloc[r, c] >= p_cutoff:  # We're done!
        return correlation_p_vals

    drop_label = correlation_p_vals.index[
        [r, c][
            scores[correlation_p_vals.index[[r, c]]].argmin()
        ]
    ]
    correlation_p_vals = correlation_p_vals.drop(index=drop_label)
    correlation_p_vals.drop(columns=drop_label, inplace=True)

    return greedy_variate_removal(correlation_p_vals, p_cutoff, scores)


correlation_p_vals = greedy_variate_removal(
    correlation_p_vals, 0.001, n_annotations_corr
)
filtered_covariates = list(correlation_p_vals.index)
correlation_p_vals
temperature_minimum_degC temperature_minimum_evening_degC relative_humidity_9am_pc wind_speed_9am_kph daylight_hours temperature_range
temperature_minimum_degC 1.000000 0.002116 0.002227 0.765180 0.002298 0.200886
temperature_minimum_evening_degC 0.002116 1.000000 0.009102 0.598260 0.007022 0.768792
relative_humidity_9am_pc 0.002227 0.009102 1.000000 0.518422 0.010718 0.290696
wind_speed_9am_kph 0.765180 0.598260 0.518422 1.000000 0.517252 0.463215
daylight_hours 0.002298 0.007022 0.010718 0.517252 1.000000 0.723849
temperature_range 0.200886 0.768792 0.290696 0.463215 0.723849 1.000000

With this new filtered list of covariates, we again plot the pairs to make sure that everything looks nice and uncorrelated.

grr = pd.plotting.scatter_matrix(
    maelstrom_df[["n_annotations", *filtered_covariates]],
    marker="o",
    figsize=(15, 15),
)
for ax in grr[-1, :]:
    ax.set_xlabel(ax.get_xlabel(), rotation=45, ha="right")
for ax in grr[:, 0]:
    ax.set_ylabel(ax.get_ylabel(), rotation=45, ha="right")
../../_images/activity_analysis_32_0.png

Fitting a Poisson GLM of n_annotations vs. each covariate individually, and plotting the effect:

pois = sm.families.Poisson()

tvalues = []
pvalues = []
for covariate in filtered_covariates:
    mod = smf.glm(
        f"n_annotations ~ {covariate}",
        data=maelstrom_df,
        family=pois,
        exposure=maelstrom_df["exposures"],
    )
    res = mod.fit()
    tvalues.append(res.tvalues[1])
    pvalues.append(res.pvalues[1])

tvalues = np.array(tvalues)
pvalues = np.array(pvalues)

ordering = np.argsort(tvalues)
coloring = np.array(["r", "b"])[(pvalues[ordering] < 0.05).astype("u1")]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Single main effect plot",
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    np.array(filtered_covariates)[ordering],
    tvalues[ordering] - 1.96,
    tvalues[ordering] + 1.96,
    color=coloring
)
p = ax.scatter(
    tvalues[ordering],
    np.array(filtered_covariates)[ordering],
    color=coloring
)

significant_single_effect_variables = set(np.array(filtered_covariates)[pvalues < 0.05])
print("Significant single-effect variables:", *significant_single_effect_variables, sep="\n  - ")
Significant single-effect variables:
  - daylight_hours
  - temperature_minimum_evening_degC
  - temperature_range
  - wind_speed_9am_kph
../../_images/activity_analysis_34_1.png

Fitting a Poisson GLM of n_annotations vs. all of the covariates, and plotting the effect:

mod = smf.glm(
    "n_annotations ~ " + " + ".join(filtered_covariates),
    data=maelstrom_df,
    family=pois,
    exposure=maelstrom_df["exposures"],
)
res = mod.fit()
print(res.summary())
print(f"res.aic={res.aic}")

ordering = np.argsort(res.tvalues)
coloring = np.array(["r", "b"])[
    (res.pvalues[ordering] < 0.05).astype("u1")
]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    res.tvalues.index[ordering],
    res.tvalues[ordering] - 1.96,
    res.tvalues[ordering] + 1.96,
    color=coloring,
)
p = ax.scatter(res.tvalues[ordering], res.tvalues.index[ordering], color=coloring)

significant_mixed_effect_variables = set(res.tvalues.index[res.pvalues < 0.05])

print("\nSignificant mixed-effect variables:", *significant_mixed_effect_variables, sep="\n  - ")
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:          n_annotations   No. Observations:                   12
Model:                            GLM   Df Residuals:                        5
Model Family:                 Poisson   Df Model:                            6
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47.274
Date:                Mon, 09 Aug 2021   Deviance:                       24.473
Time:                        14:47:53   Pearson chi2:                     24.2
No. Iterations:                     4
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                           41.3617     11.516      3.592      0.000      18.790      63.933
temperature_minimum_degC            -0.1297      0.025     -5.230      0.000      -0.178      -0.081
temperature_minimum_evening_degC     0.1897      0.022      8.543      0.000       0.146       0.233
relative_humidity_9am_pc            -0.0017      0.004     -0.468      0.640      -0.009       0.005
wind_speed_9am_kph                   0.0062      0.006      1.120      0.263      -0.005       0.017
daylight_hours                      -3.1266      0.806     -3.881      0.000      -4.705      -1.548
temperature_range                    0.1548      0.052      3.000      0.003       0.054       0.256
====================================================================================================
res.aic=108.54753196329011

Significant mixed-effect variables:
  - temperature_minimum_degC
  - daylight_hours
  - temperature_minimum_evening_degC
  - Intercept
  - temperature_range
../../_images/activity_analysis_36_1.png

From the above plot, we can see that some of the variables do not have a significant effect on “n_annotations”. So let’s remove them and see if we get a better model.

sig_eff_variables = list(significant_mixed_effect_variables - {"Intercept"})

mod = smf.glm(
    "n_annotations ~ " + " + ".join(sig_eff_variables),
    data=maelstrom_df,
    family=pois,
    exposure=maelstrom_df["exposures"],
)
res = mod.fit()
print(res.summary())
print(f"res.aic={res.aic}")

ordering = np.argsort(res.tvalues)
coloring = np.array(["r", "b"])[
    (res.pvalues[ordering] < 0.05).astype("u1")
]

fig = plt.figure()
ax = fig.add_subplot(
    111,
    xlabel="Scaled estimate",
    ylabel="Variable",
)
ax.axvline(0, c="gray")
ax.hlines(
    res.tvalues.index[ordering],
    res.tvalues[ordering] - 1.96,
    res.tvalues[ordering] + 1.96,
    color=coloring,
)
p = ax.scatter(res.tvalues[ordering], res.tvalues.index[ordering], color=coloring)
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:          n_annotations   No. Observations:                   12
Model:                            GLM   Df Residuals:                        7
Model Family:                 Poisson   Df Model:                            4
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47.915
Date:                Mon, 09 Aug 2021   Deviance:                       25.756
Time:                        14:47:53   Pearson chi2:                     25.3
No. Iterations:                     4
Covariance Type:            nonrobust
====================================================================================================
                                       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercept                           39.6614     10.961      3.618      0.000      18.178      61.144
temperature_minimum_degC            -0.1381      0.024     -5.790      0.000      -0.185      -0.091
temperature_minimum_evening_degC     0.1997      0.020     10.108      0.000       0.161       0.238
daylight_hours                      -3.0297      0.775     -3.909      0.000      -4.549      -1.511
temperature_range                    0.1882      0.041      4.562      0.000       0.107       0.269
====================================================================================================
res.aic=105.83037074639384
../../_images/activity_analysis_38_1.png

Removing the variables from the model lowered AIC, meaning that not much information was lost, while reducing the risk of overfitting. We will proceed with this model.

To check that nothing strange is going on, we can plot the residuals:

fig = plt.figure()
ax = fig.add_subplot(
    111,
    title="Residual plot for:\n" + "\n".join(textwrap.wrap(mod.formula, width=55)),
    ylabel="Pearson residual",
    xlabel="Fitted values (n_annotations)",
)

ax.axhline(0, c="gray")
p = ax.scatter(res.fittedvalues, res.resid_pearson, marker="x", c="k")
../../_images/activity_analysis_40_0.png

Finally, we can combine the summary abundance/activity plots from above, this time including the predicted values for “n_annotations” (and confidence interval) into the maelstrom plot.

prediction = res.get_prediction()

# Set up figure
fig = plt.figure(
    figsize=(7.5, 5.2),
    tight_layout=True,
)
title_y = 0.87
title_fontdict = {"fontweight": "bold"}

# Define each subplot
# General summary plot
ax1 = fig.add_subplot(
    221,
    xlabel="Day Number",
    ylabel="Number of annotations",
)
ax1.set_title(
    " (a)", fontdict=title_fontdict, loc="left", y=title_y
)
# Twilight plot
ax2 = fig.add_subplot(
    222,
    xlabel="Twilight durations since sunset",
)
ax2.set_title(
    " (b)", fontdict=title_fontdict, loc="left", y=title_y
)
# Maelstrom plot with temperatures
ax3 = fig.add_subplot(
    212,
    xlabel="Day Number",
    ylabel="Number of annotations\nDuring twilight",
)
ax3.set_title(
    " (c)", fontdict=title_fontdict, loc="left", y=title_y
)

# Actual plotting
# General summary plot
plot_activity_levels_summary(
    df,
    ax1,
    x_column="daynumber",
    bin_width=10 / 1440,   # 10 minutes
    c="k",
    lw=0.75,
)

# Twilight plot
plot_activity_levels_summary(
    df,
    ax2,
    x_column="within_twilight",
    bin_width=0.1,   # 10th of twilight duration
    c="k",
    lw=0.75,
)
twilight_vspan = ax2.axvspan(
    0, 1, alpha=0.3, label="Evening twilight"
)

# Maelstrom plot with temperatures
plot_maelstroms_with_temperature(
    maelstrom_df,
    ax3,
    maelstrom_kwargs=dict(
        c="k",
        lw=3,
    ),
)

# Model prediction
ax3.plot(
    maelstrom_df["daynumber"],
    prediction.predicted_mean * maelstrom_df["exposures"],
    color="g",
    alpha=0.7,
    linestyle="dotted",
)
conf_int = prediction.conf_int()
ax3.fill_between(
    maelstrom_df["daynumber"],
    conf_int[:, 0] * maelstrom_df["exposures"],
    conf_int[:, 1] * maelstrom_df["exposures"],
    color="g",
    alpha=0.25,
    linestyle="dotted",
)

# Force x-axis ticks to be integers to make it prettier
for ax in fig.axes:
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
../../_images/activity_analysis_42_0.png
fig.savefig("activity_levels_figure.pdf", dpi=600.0, pad_inches=0.0)