Simulate a noisy time series

Let’s say we have a crazy noisy time series where we have perhaps oversampled and encounter spurious variability. Simulate some noisy data. This will represent 1000 observations sampled from a random normal distrubution with a mean=20 and an sd=2. Here it is.

noisy_dat <- data.frame(noisy = rnorm(1000, 20, 2))
str(noisy_dat)

## 'data.frame':    1000 obs. of  1 variable:
##  $ noisy: num  21.6 19.7 21.2 19.5 20.7 ...

noisy_dat <- noisy_dat %>%
    mutate(noisy_seq = seq_along(noisy))
head(noisy_dat, n = 15)

noisy	noisy_seq
21.59	1
19.74	2
21.17	3
19.49	4
20.71	5
18.22	6
18.01	7
21.08	8
20.04	9
20.18	10
22.52	11
23.58	12
18.16	13
20.00	14
20.75	15

p1 <- ggplot(noisy_dat, aes(x = noisy_seq, y = noisy)) + geom_line(color = "red",
    alpha = 0.7) + ylim(c(15, 25)) + jamie.theme
print(p1)

Smoothing: simple moving average

Rolling means represent a simple moving average for any window size (k) you specify. That is, every n’th observation is averaged with the k-past observations to derive a rolling mean that replaces the original observation. This smooths the original data series, diminishing the unique contribution to single observations in favor of the collective. Lots of caveats for using this approach.

# load `zoo` package
library(zoo)
smoother_dat <- noisy_dat %>%
    mutate(smoothdat = zoo::rollmean(noisy, k = 5, fill = NA))  #use a moving window of 5
head(smoother_dat, n = 60)

noisy	noisy_seq	smoothdat
21.59	1	NA
19.74	2	NA
21.17	3	20.54
19.49	4	19.87
20.71	5	19.52
18.22	6	19.50
18.01	7	19.61
21.08	8	19.50
20.04	9	20.36
20.18	10	21.48
22.52	11	20.89
23.58	12	20.89
18.16	13	21.00
20.00	14	21.24
20.75	15	20.28
23.70	16	20.46
18.80	17	19.38
19.05	18	19.79
14.59	19	18.93
22.81	20	18.61
19.39	21	18.46
17.24	22	19.17
18.27	23	18.26
18.13	24	17.56
18.28	25	18.17
15.89	26	18.21
20.31	27	18.57
18.42	28	19.70
19.97	29	20.64
23.91	30	21.11
20.62	31	21.48
22.61	32	21.94
20.30	33	20.49
22.29	34	20.43
16.64	35	19.86
20.34	36	19.76
19.75	37	18.91
19.79	38	19.02
18.06	39	19.27
17.16	40	19.08
21.60	41	19.41
18.79	42	19.14
21.43	43	19.23
16.70	44	18.76
17.61	45	19.31
19.26	46	19.12
21.54	47	19.68
20.49	48	20.36
19.51	49	21.13
20.99	50	21.56
23.14	51	21.58
23.65	52	21.97
20.62	53	21.59
21.43	54	21.20
19.12	55	19.91
21.19	56	18.67
17.19	57	18.29
14.40	58	18.74
19.53	59	18.65
21.40	60	19.61

ggplot(smoother_dat, aes(x = noisy_seq, y = smoothdat)) + geom_line(color = "goldenrod2",
    alpha = 0.7) + ylim(c(15, 25)) + jamie.theme

Simulate random missingness

Create some random holes in this time series by replacing 50 observations with NAs.

set.seed(998877)

# Assuming `column_name` is the name of the column you want to modify
noisy_missing_dat <- noisy_dat %>%
    mutate(noisy_missing = noisy) %>%
    mutate(noisy_missing = ifelse(row_number() %in% sample(n(), 100), NA, noisy_missing))

# View the resulting dataframe
head(noisy_missing_dat, n = 30)

noisy	noisy_seq	noisy_missing
21.59	1	21.59
19.74	2	19.74
21.17	3	21.17
19.49	4	19.49
20.71	5	20.71
18.22	6	18.22
18.01	7	18.01
21.08	8	21.08
20.04	9	20.04
20.18	10	20.18
22.52	11	22.52
23.58	12	23.58
18.16	13	NA
20.00	14	20.00
20.75	15	20.75
23.70	16	23.70
18.80	17	18.80
19.05	18	19.05
14.59	19	14.59
22.81	20	22.81
19.39	21	19.39
17.24	22	17.24
18.27	23	18.27
18.13	24	18.13
18.28	25	18.28
15.89	26	15.89
20.31	27	20.31
18.42	28	18.42
19.97	29	19.97
23.91	30	23.91

Interpolate across missing values

You can’t plot a time series that has empty observations. You’ll need to figure out an imputation method. There are lots of them, but here I’ll just use linear interpolation. We will run this on noisy_missing to fill the missing values interpolating across endpoints. There are much fancier ways to do this, but we will go simple for now.

noisy_missing_dat$noisy_interpolated <- zoo::na.approx(noisy_missing_dat$noisy, na.rm = FALSE)
head(noisy_missing_dat, n = 20)

noisy	noisy_seq	noisy_missing	noisy_interpolated
21.59	1	21.59	21.59
19.74	2	19.74	19.74
21.17	3	21.17	21.17
19.49	4	19.49	19.49
20.71	5	20.71	20.71
18.22	6	18.22	18.22
18.01	7	18.01	18.01
21.08	8	21.08	21.08
20.04	9	20.04	20.04
20.18	10	20.18	20.18
22.52	11	22.52	22.52
23.58	12	23.58	23.58
18.16	13	NA	18.16
20.00	14	20.00	20.00
20.75	15	20.75	20.75
23.70	16	23.70	23.70
18.80	17	18.80	18.80
19.05	18	19.05	19.05
14.59	19	14.59	14.59
22.81	20	22.81	22.81

Simple moving average on interpolated time series

Putting it together. Compute a simple moving average with window of 8 observations on the interpolated time series.

# fill the first few observations that go missing when they don't have a
# precedent (e.g., first four observations in a 5-item window)
smoothed_interp_dat <- noisy_missing_dat %>%
    mutate(smoothed_interpolated_dat = zoo::rollmean(noisy_interpolated, k = 8, fill = "extend"))
str(smoothed_interp_dat)

## 'data.frame':    1000 obs. of  5 variables:
##  $ noisy                    : num  21.6 19.7 21.2 19.5 20.7 ...
##  $ noisy_seq                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ noisy_missing            : num  21.6 19.7 21.2 19.5 20.7 ...
##  $ noisy_interpolated       : num  21.6 19.7 21.2 19.5 20.7 ...
##  $ smoothed_interpolated_dat: num  20 20 20 20 19.8 ...

ggplot(smoothed_interp_dat, aes(x = noisy_seq, y = smoothed_interpolated_dat)) +
    geom_line(color = "tan2", alpha = 0.7) + ylim(c(15, 25)) + jamie.theme

Or plot a smoothed fn

loess smoothed curve

ggplot(smoothed_interp_dat, aes(x = noisy_seq, y = smoothed_interpolated_dat)) +
    geom_smooth() + jamie.theme

Smoothing, Rolling Means, Linear Interpolation

Jamie Reilly, Ph.D.