This tutorial explains how to create time series features with tsfresh using the Beijing Multi-Site Air-Quality Data downloaded from the UCI Machine Learning Repository.

Packages

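The documentation for each package used in this tutorial is linked below:

- pandas: https://pandas.pydata.org/docs/
- tsfresh: https://tsfresh.readthedocs.io/
- urllib.request: https://docs.python.org/3/library/urllib.request.html
- io: https://docs.python.org/3/library/io.html
- zipfile: https://docs.python.org/3/library/zipfile.html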

Open up a new Jupyter notebook and import the following:


import pandas as pd
import tsfresh
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

Create initial dataset

The zip file is downloaded from the UCI Machine Learning Repository using urllib and unzipped with zipfile. It contains one CSV file for each reporting station. Read each of these CSV files and combine them into a single pandas dataframe.


url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00501/PRSA2017_Data_20130301-20170228.zip"

# Download the whole archive into memory; the context manager closes the
# HTTP response as soon as the bytes have been read.
with urlopen(url) as response:
    archive_bytes = BytesIO(response.read())

# Read every per-station CSV in the archive. The frames are collected in a
# list and concatenated once: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every iteration.
station_frames = []
with ZipFile(archive_bytes) as zf:
    for member in zf.infolist():
        if member.filename.endswith('.csv'):
            with zf.open(member) as csv_file:
                station_frames.append(pd.read_csv(csv_file))

# The original per-file row index is kept (no ignore_index), matching the
# output shown below where every station's first row has index 0.
df = pd.concat(station_frames)

# Build a single datetime column from the separate date-part columns.
df['timestamp'] = pd.to_datetime(df[["year", "month", "day", "hour"]])
# 'No' is dropped here; it does not appear in the frame displayed below.
df = df.drop(columns=['No'])
df.sort_values(by=['timestamp', 'station']).head(10)

	year	month	day	hour	PM2.5	PM10	SO2	NO2	CO	O3	TEMP	PRES	DEWP	RAIN	wd	WSPM	station	timestamp
0	2013	3	1	0	4.0	4.0	4.0	7.0	300.0	77.0	-0.7	1023.0	-18.8	0.0	NNW	4.4	Aotizhongxin	2013-03-01
0	2013	3	1	0	3.0	6.0	13.0	7.0	300.0	85.0	-2.3	1020.8	-19.7	0.0	E	0.5	Changping	2013-03-01
0	2013	3	1	0	4.0	4.0	3.0	NaN	200.0	82.0	-2.3	1020.8	-19.7	0.0	E	0.5	Dingling	2013-03-01
0	2013	3	1	0	9.0	9.0	3.0	17.0	300.0	89.0	-0.5	1024.5	-21.4	0.0	NNW	5.7	Dongsi	2013-03-01
0	2013	3	1	0	4.0	4.0	14.0	20.0	300.0	69.0	-0.7	1023.0	-18.8	0.0	NNW	4.4	Guanyuan	2013-03-01
0	2013	3	1	0	6.0	18.0	5.0	NaN	800.0	88.0	0.1	1021.1	-18.6	0.0	NW	4.4	Gucheng	2013-03-01
0	2013	3	1	0	7.0	7.0	3.0	2.0	100.0	91.0	-2.3	1020.3	-20.7	0.0	WNW	3.1	Huairou	2013-03-01
0	2013	3	1	0	5.0	14.0	4.0	12.0	200.0	85.0	-0.5	1024.5	-21.4	0.0	NNW	5.7	Nongzhanguan	2013-03-01
0	2013	3	1	0	3.0	6.0	3.0	8.0	300.0	44.0	-0.9	1025.8	-20.5	0.0	NW	9.3	Shunyi	2013-03-01
0	2013	3	1	0	6.0	6.0	4.0	8.0	300.0	81.0	-0.5	1024.5	-21.4	0.0	NNW	5.7	Tiantan	2013-03-01


tsfresh doesn't handle missing values well, so check for them before extracting any features.


# Count the missing values in every column.
df.isnull().sum()

# Numeric measurement columns of the data set (see the table header above).
# The date-part columns and the categorical wind direction ('wd') are left
# out so that tsfresh only sees the series it should summarise.
value_columns = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3',
                 'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']

# Build the frame passed to tsfresh: one row per station and timestamp.
# The index is reset because the concatenated CSVs repeat index labels.
ts_df = (
    df.sort_values(by=['station', 'timestamp'])
    [['station', 'timestamp'] + value_columns]
    .reset_index(drop=True)
)

# Fill the gaps station by station so values never leak between stations;
# limit_direction='both' also fills gaps at the start or end of a series.
ts_df[value_columns] = (
    ts_df.groupby('station')[value_columns]
    .transform(lambda series: series.interpolate(limit_direction='both'))
)

# MinimalFCParameters restricts tsfresh to a small set of basic summary
# statistics, producing one output row per station.
df_features = tsfresh.extract_features(ts_df, column_id='station', column_sort='timestamp',
                                       default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters())
df_features.columns

You should see this in your output:


Index(['PM2.5__sum_values', 'PM2.5__median', 'PM2.5__mean', 'PM2.5__length',
       'PM2.5__standard_deviation', 'PM2.5__variance',
       'PM2.5__root_mean_square', 'PM2.5__maximum', 'PM2.5__minimum',
       'PM10__sum_values', 'PM10__median', 'PM10__mean', 'PM10__length',
       'PM10__standard_deviation', 'PM10__variance', 'PM10__root_mean_square',
       'PM10__maximum', 'PM10__minimum', 'DEWP__sum_values', 'DEWP__median',
       'DEWP__mean', 'DEWP__length', 'DEWP__standard_deviation',
       'DEWP__variance', 'DEWP__root_mean_square', 'DEWP__maximum',
       'DEWP__minimum', 'RAIN__sum_values', 'RAIN__median', 'RAIN__mean',
       'RAIN__length', 'RAIN__standard_deviation', 'RAIN__variance',
       'RAIN__root_mean_square', 'RAIN__maximum', 'RAIN__minimum',
       'SO2__sum_values', 'SO2__median', 'SO2__mean', 'SO2__length',
       'SO2__standard_deviation', 'SO2__variance', 'SO2__root_mean_square',
       'SO2__maximum', 'SO2__minimum', 'NO2__sum_values', 'NO2__median',
       'NO2__mean', 'NO2__length', 'NO2__standard_deviation', 'NO2__variance',
       'NO2__root_mean_square', 'NO2__maximum', 'NO2__minimum',
       'CO__sum_values', 'CO__median', 'CO__mean', 'CO__length',
       'CO__standard_deviation', 'CO__variance', 'CO__root_mean_square',
       'CO__maximum', 'CO__minimum', 'O3__sum_values', 'O3__median',
       'O3__mean', 'O3__length', 'O3__standard_deviation', 'O3__variance',
       'O3__root_mean_square', 'O3__maximum', 'O3__minimum',
       'WSPM__sum_values', 'WSPM__median', 'WSPM__mean', 'WSPM__length',
       'WSPM__standard_deviation', 'WSPM__variance', 'WSPM__root_mean_square',
       'WSPM__maximum', 'WSPM__minimum', 'TEMP__sum_values', 'TEMP__median',
       'TEMP__mean', 'TEMP__length', 'TEMP__standard_deviation',
       'TEMP__variance', 'TEMP__root_mean_square', 'TEMP__maximum',
       'TEMP__minimum', 'PRES__sum_values', 'PRES__median', 'PRES__mean',
       'PRES__length', 'PRES__standard_deviation', 'PRES__variance',
       'PRES__root_mean_square', 'PRES__maximum', 'PRES__minimum'],
      dtype='object')
      

A dictionary of features and settings can also be created to control which features are generated. Below is an example:


def _param_grid(name, values):
    """Return one single-key parameter dict per entry of *values*."""
    return [{name: value} for value in values]


# Feature calculators that take no parameters; each one maps to None.
_parameter_free = (
    'variance_larger_than_standard_deviation',
    'has_duplicate_max',
    'has_duplicate_min',
    'has_duplicate',
    'sum_values',
    'abs_energy',
    'mean_abs_change',
    'mean_change',
    'mean_second_derivative_central',
    'median',
    'mean',
    'length',
    'standard_deviation',
    'variation_coefficient',
    'variance',
    'skewness',
    'kurtosis',
    'root_mean_square',
    'absolute_sum_of_changes',
    'longest_strike_below_mean',
    'longest_strike_above_mean',
    'count_above_mean',
    'count_below_mean',
    'last_location_of_maximum',
    'first_location_of_maximum',
    'last_location_of_minimum',
    'first_location_of_minimum',
    'percentage_of_reoccurring_values_to_all_values',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
    'sum_of_reoccurring_values',
    'sum_of_reoccurring_data_points',
    'ratio_value_number_to_time_series_length',
    'maximum',
    'minimum',
    'benford_correlation',
)

# Value grids used by more than one calculator. _param_grid builds a fresh
# list on every call, so no two entries of fc_settings share a list object.
_quantile_levels = (0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9)
_trend_attributes = ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')

# Parameter-free calculators first, then the parameterised ones, keeping the
# same key order as a hand-written literal would have.
fc_settings = dict.fromkeys(_parameter_free)
fc_settings.update({
    'time_reversal_asymmetry_statistic': _param_grid('lag', range(1, 4)),
    'c3': _param_grid('lag', range(1, 4)),
    'cid_ce': _param_grid('normalize', (True, False)),
    'symmetry_looking': _param_grid(
        'r', (0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5)),
    'large_standard_deviation': _param_grid(
        'r', (0.5, 0.75, 0.9500000000000001)),
    'quantile': _param_grid('q', _quantile_levels),
    'autocorrelation': _param_grid('lag', range(10)),
    'agg_autocorrelation': [
        {'f_agg': aggregate, 'maxlag': 40}
        for aggregate in ('mean', 'median', 'var')
    ],
    'partial_autocorrelation': _param_grid('lag', range(10)),
    'number_cwt_peaks': _param_grid('n', (1, 5)),
    'number_peaks': _param_grid('n', (1, 3, 5, 10, 50)),
    'binned_entropy': _param_grid('max_bins', (10,)),
    'index_mass_quantile': _param_grid('q', _quantile_levels),
    'spkt_welch_density': _param_grid('coeff', (2, 5, 8)),
    'ar_coefficient': [
        {'coeff': coefficient, 'k': 10} for coefficient in range(11)
    ],
    'value_count': _param_grid('value', (0, 1, -1)),
    'range_count': [{'min': -1, 'max': 1}],
    'linear_trend': _param_grid('attr', _trend_attributes),
    'augmented_dickey_fuller': _param_grid(
        'attr', ('teststat', 'pvalue', 'usedlag')),
    'number_crossing_m': _param_grid('m', (0, -1, 1)),
    'energy_ratio_by_chunks': [
        {'num_segments': 10, 'segment_focus': focus} for focus in range(10)
    ],
    # The int/float mix is deliberate and kept exactly as listed.
    'ratio_beyond_r_sigma': _param_grid(
        'r', (0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10)),
    'linear_trend_timewise': _param_grid('attr', _trend_attributes),
    'count_above': _param_grid('t', (0,)),
    'count_below': _param_grid('t', (0,)),
    'permutation_entropy': [
        {'tau': 1, 'dimension': dimension} for dimension in range(3, 8)
    ],
    'query_similarity_count': [{'query': None, 'threshold': 0.0}],
})
 

# Run the extraction again on the same frame, this time passing the
# hand-written settings dictionary instead of a predefined parameter set.
extraction_options = {
    'column_id': 'station',
    'column_sort': 'timestamp',
    'default_fc_parameters': fc_settings,
}
df_features = tsfresh.extract_features(ts_df, **extraction_options)
df_features.columns

Time-series forecasting use case

The above method rolls all of the time series data up into a single record per column_id (station in this case). For time-series forecasting, this summarization often needs to be done at each timestamp instead, summarizing only the data up to the current timestamp. roll_time_series creates a dataframe that allows tsfresh to calculate the features at each timestamp correctly. We control the maximum window of the data with the parameter max_timeshift.


# Work on a single year so the rolled frame, which repeats every row once
# per window it falls into, stays a manageable size.
df2014 = (
    df[df['year'] == 2014]
    .sort_values(by=['station', 'timestamp'])
    .reset_index(drop=True)
)

# tsfresh does not cope well with missing values, so fill the gaps within
# each station before rolling. This is a no-op on data that is already complete.
measurement_columns = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3',
                       'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']
df2014[measurement_columns] = (
    df2014.groupby('station')[measurement_columns]
    .transform(lambda series: series.interpolate(limit_direction='both'))
)

# Build one sub-series per (station, timestamp). Both shift limits are set to
# 24; per the roll_time_series documentation this bounds the window length
# from above (max_timeshift) and discards shorter windows (min_timeshift).
df_rolled = tsfresh.utilities.dataframe_functions.roll_time_series(df2014,
                                                                   column_id='station',
                                                                   column_sort='timestamp',
                                                                   min_timeshift=24,
                                                                   max_timeshift=24)
# Remove the date-part, categorical and original id columns so they are not
# treated as value series during feature extraction.
df_rolled = df_rolled.drop(columns=['year', 'month', 'day', 'hour', 'wd', 'station'])

Now that the rolled dataframe has been created, extract_features can be run just as before.


# Extract the minimal feature set again, now keyed on the 'id' column of the
# rolled frame instead of on 'station'.
minimal_settings = tsfresh.feature_extraction.MinimalFCParameters()
df_features = tsfresh.extract_features(
    df_rolled,
    column_id='id',
    column_sort='timestamp',
    default_fc_parameters=minimal_settings,
)
df_features.columns

# Move the index levels back into ordinary columns and give them
# descriptive names.
index_column_names = {'level_0': 'station', 'level_1': 'timestamp'}
df_features = df_features.reset_index()
df_features = df_features.rename(columns=index_column_names)
df_features.head()