# 3장 시계열 데이터를 위한 탐색적 데이터 분석
# A unique Python library that extends the Python programming language and provides utilities that enhance productivity.
## First look at the built-in EuStockMarkets multivariate time series.
head(EuStockMarkets)
plot(EuStockMarkets)
## Inspect the object's class and its time-series attributes.
class(EuStockMarkets)
frequency(EuStockMarkets)
start(EuStockMarkets)
end(EuStockMarkets)
## Extract the observations between 1997 and 1998.
window(EuStockMarkets, start = 1997, end = 1998)
## Histogram of the raw SMI values versus their first differences, both with
## 30 suggested breaks. The break count belongs to hist(); passing it to
## diff() would instead compute lag-30 differences with default breaks.
hist(EuStockMarkets[, "SMI"], 30)
hist(diff(EuStockMarkets[, "SMI"]), 30)
## Scatter plots of SMI against DAX: raw levels, first differences, and
## first differences with the SMI series shifted by one step via lag().
plot(EuStockMarkets[, "SMI"], EuStockMarkets[, "DAX"])
plot(diff(EuStockMarkets[, "SMI"]), diff(EuStockMarkets[, "DAX"]))
plot(lag(diff(EuStockMarkets[, "SMI"]), 1),
     diff(EuStockMarkets[, "DAX"]))
## Simulate a noisy upward trend: 100 normal draws (mean 0, sd 10) added
## element-wise to the sequence 1:100.
x <- 1:100 + rnorm(n = 100, mean = 0, sd = 10)
## mn(n) builds an equal-weight kernel: the value 1/n repeated n times.
mn <- function(n) {
  rep(1 / n, times = n)
}
plot(x, type = 'l', lwd = 1)
## Rolling means computed with the base filter() from the stats package,
## using windows of 5 and 50 points respectively.
lines(stats::filter(x, mn(5)), lty = 2, lwd = 3, col = 2)
lines(stats::filter(x, mn(50)), lty = 3, lwd = 3, col = 3)
## The rolling computation can also be customised with zoo::rollapply().
## Install zoo only when it is missing, then attach it with library(), which
## stops with an error if the package cannot be loaded (require() would only
## return FALSE and let the script continue).
if (!requireNamespace("zoo", quietly = TRUE)) {
  install.packages("zoo")
}
library(zoo)
## Wrap x in a zoo object so that every observation carries an index.
## rollapply() takes the data, the window width, the function to apply, the
## alignment of the window, and `partial`, which controls whether windows
## with fewer observations than the width are still evaluated.
f1 <- rollapply(zoo(x), 20, function(w) min(w),
                align = "left", partial = TRUE)
f2 <- rollapply(zoo(x), 20, function(w) min(w),
                align = "right", partial = TRUE)
## Overlay the left-aligned and right-aligned rolling minima on the series.
plot(x, lwd = 1, type = 'l')
lines(f1, col = 2, lwd = 3, lty = 2)
lines(f2, col = 3, lwd = 3, lty = 3)
# Expanding window
plot(x, type = 'l', lwd = 1)
lines(cummax(x), col = 2, lwd = 3, lty = 2) # running maximum
lines(cumsum(x) / seq_along(x), col = 3, lwd = 3, lty = 3) # running mean
# The same two curves via rollapply(): a vector of widths 1, 2, ..., n with
# right alignment makes each window span everything seen so far.
plot(x, type = 'l', lwd = 1)
lines(rollapply(zoo(x), seq_along(x), function(w) max(w),
                partial = TRUE, align = "right"),
      col = 2, lwd = 3, lty = 2)
# The mean is drawn in colour 3, matching the cumsum()-based mean line in
# the first plot, so the two overlays are not drawn in the same colour.
lines(rollapply(zoo(x), seq_along(x), function(w) mean(w),
                partial = TRUE, align = "right"),
      col = 3, lwd = 3, lty = 3)
## Sine wave sampled at the integers 1..100 (period of 6 samples), plotted
## as points joined by lines, followed by its autocorrelation function.
x <- seq_len(100)
y <- sin(x * pi / 3)
plot(y, type = "b")
acf(y)
## Install data.table only when it is missing, then attach it.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)
## cor() computes the correlation coefficient between its first two
## arguments. `use` selects how missing values are handled:
## "pairwise.complete.obs" drops only the pairs in which a value is missing,
## which is needed here because shift() leaves NA at the start of the
## shifted series.
## Correlation of y with itself shifted by lags 1 and 2.
cor(y, shift(y, 1), use = "pairwise.complete.obs")
cor(y, shift(y, 2), use = "pairwise.complete.obs")
## Period-6 sine: plot the first 30 points, then its partial autocorrelation.
y <- sin(x * pi / 3)
plot(y[1:30], type = "b")
pacf(y)

## The same period-6 sine kept as y1, with its ACF and PACF.
y1 <- sin(x * pi / 3)
plot(y1, type = "b")
acf(y1)
pacf(y1)

## A slower sine with a period of 20 samples.
y2 <- sin(x * pi / 10)
plot(y2, type = "b")
acf(y2)
pacf(y2)

## Sum of the two sines; y is overwritten with the combined series.
y <- y1 + y2
plot(y, type = "b")
acf(y)
pacf(y)
## Add independent Gaussian noise (sd 0.05) to each sine and rebuild the sum.
## y1 and y2 are overwritten in place, so re-running this section adds a
## further layer of noise on top of the previous one.
noise1 <- rnorm(n = 100, sd = 0.05)
noise2 <- rnorm(n = 100, sd = 0.05)
y1 <- y1 + noise1
y2 <- y2 + noise2
y <- y1 + y2

## Noisy period-6 sine.
plot(y1, type = 'b')
acf(y1)
pacf(y1)

## Noisy period-20 sine.
plot(y2, type = 'b')
acf(y2)
pacf(y2)

## Noisy sum of both sines.
plot(y, type = 'b')
acf(y)
pacf(y)
## A pure linear trend (the integers 1..100) with its ACF and PACF.
x <- seq_len(100)
plot(x)
acf(x)
pacf(x)
## Install timevis only when it is missing, then attach it; library() fails
## loudly if the package cannot be loaded.
if (!requireNamespace("timevis", quietly = TRUE)) {
  install.packages("timevis")
}
library(timevis)
## fread() is provided by data.table, which must already be attached.
## NOTE(review): assumes donations.csv has `user` and `timestamp` columns,
## as implied by the expressions below - confirm against the file.
donations <- fread("donations.csv")
## First and last timestamp per user, under the column names used in the
## timevis() call below: content, start, end.
d <- donations[, .(start = min(timestamp), end = max(timestamp)), by = user]
setnames(d, "user", "content")
## Drop users whose first and last timestamps coincide.
d <- d[start != end]
## Show a random sample of at most 20 users; capping the sample size at
## nrow(d) avoids an error when fewer than 20 rows remain.
timevis(d[sample(seq_len(nrow(d)), min(20, nrow(d)))])
## Fold the AirPassengers series into a 12 x 12 matrix. matrix() fills
## column by column, so each column holds one year and each row one month.
monthly_counts <- matrix(AirPassengers, nrow = 12, ncol = 12)
## Display the transposed table (one row per year).
t(monthly_counts)
## One colour per year.
colors <- c(
  "green", "red", "pink", "blue",
  "yellow", "lightsalmon", "black", "gray",
  "cyan", "lightblue", "maroon", "purple"
)
## One line per year across the twelve months; the default x axis is
## suppressed so that month labels can be drawn afterwards.
matplot(monthly_counts,
        type = 'l', lty = 1, lwd = 2.5, col = colors,
        xaxt = "n", ylab = "Passenger Count")
legend("topleft", legend = 1949:1960, col = colors, lty = 1, lwd = 2.5)
axis(1,
     at = 1:12,
     labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
## Install forecast only when it is missing, then attach it; library() stops
## with an error if loading fails instead of silently returning FALSE.
if (!requireNamespace("forecast", quietly = TRUE)) {
  install.packages("forecast")
}
library(forecast)
## Seasonal plot of the AirPassengers series.
seasonplot(AirPassengers)
## Transposed view: one line per month, followed across the twelve years.
months <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
## The matrix expression is passed inline because matplot() uses the
## deparsed argument as the default y-axis label.
matplot(t(matrix(AirPassengers, nrow = 12, ncol = 12)),
        type = 'l', lty = 1, lwd = 2.5, col = colors)
legend("left", legend = months, lty = 1, lwd = 2.5, col = colors)
## Month-by-month subseries plot from the stats package.
monthplot(AirPassengers)
## Build a column-wise "2D histogram" from row-wise observations.
##
## Arguments:
##   data     matrix (or data frame / data.table of numeric columns) in which
##            each row represents one data point and each column one position
##            along the x axis.
##   nbins.y  number of evenly spaced bins along the value (y) axis.
##   xlabels  kept for interface compatibility; not used in the computation.
##
## Returns an nbins.y x ncol(data) numeric matrix whose [b, j] entry is the
## number of rows whose value in column j falls into bin b.
hist2d <- function(data, nbins.y, xlabels) {
  values <- as.matrix(data)
  n.cols <- ncol(values)

  ## we make ybins evenly spaced to include
  ## minimum and maximum points
  ymin <- min(values)
  top <- max(values)
  ## the lazy way out to avoid worrying about inclusion/exclusion:
  ## stretch the upper edge slightly so the maximum falls inside the last
  ## bin. Scaling only moves the edge upwards for a positive maximum, so a
  ## non-positive maximum is left as is and handled by all.inside below.
  ymax <- if (top > 0) top * 1.0001 else top
  ybins <- seq(from = ymin, to = ymax, length.out = nbins.y + 1)

  ## all.inside = TRUE clamps every index into 1..nbins.y, so a value sitting
  ## exactly on the upper edge is counted in the last bin instead of producing
  ## an out-of-range row index.
  bin <- findInterval(values, ybins, all.inside = TRUE)

  ## Encode each (bin, column) pair as a column-major cell number and count
  ## all cells in one pass instead of looping over rows and columns.
  cell <- bin + (as.vector(col(values)) - 1L) * nbins.y
  counts <- tabulate(cell, nbins = nbins.y * n.cols)

  matrix(as.numeric(counts), nrow = nbins.y, ncol = n.cols)
}
## Bin the year-by-month passenger table into 5 count bands per month and
## draw the resulting matrix as a heat map without axes.
h <- hist2d(t(matrix(AirPassengers, nrow = 12, ncol = 12)), 5, months)
image(
  x = seq_len(ncol(h)),
  y = seq_len(nrow(h)),
  z = t(h),
  col = heat.colors(5),
  axes = FALSE,
  xlab = "Time",
  ylab = "Passenger Count"
)
## Attach data.table for fread(); library() fails loudly if it is missing.
library(data.table)
## NOTE(review): url.str is not defined anywhere in the visible script; it
## must hold the location of the words data set before this line runs.
words <- fread(url.str)
## Keep only the rows whose first column (V1) equals 1.
w1 <- words[V1 == 1]
h <- hist2d(w1, 25, 1:ncol(w1))
colors <- gray.colors(20, start = 1, end = .5)
## Draw the raw counts and the log counts side by side, then restore the
## previous graphics parameters so that later plots are not squeezed into
## the two-panel layout.
old.par <- par(mfrow = c(1, 2))
image(1:ncol(h), 1:nrow(h), t(h),
      col = colors, axes = FALSE, xlab = "Time", ylab = "Projection Value")
## Empty cells give log(0) = -Inf in this panel.
image(1:ncol(h), 1:nrow(h), t(log(h)),
      col = colors, axes = FALSE, xlab = "Time", ylab = "Projection Value")
par(old.par)
## Hexagonal-binning view of the same rows of the words data.
## The row subset is a new data.table, so renaming its columns by reference
## leaves `words` untouched.
w1 <- words[V1 == 1]
## melt the data to the pairs of paired-coordinates
## expected by most 2d histogram implementations.
## The first column is named "type" and the remaining ones are numbered
## 1..k, where k is taken from the table instead of being hard-coded as 270.
setnames(w1, c("type", as.character(seq_len(ncol(w1) - 1L))))
w1 <- melt(w1, id.vars = "type")
## Drop the leading "type" column, leaving the melted variable and value.
w1 <- w1[, -1]
setnames(w1, c("Time point", "Value"))
## hexbin() comes from the hexbin package, which the visible script never
## attaches; load it here so the call and the plot can be resolved.
library(hexbin)
plot(hexbin(w1))
## Notebook support packages. The CRAN packages are installed first (only the
## missing ones) because devtools has to be available before
## devtools::install_github() can be called.
notebook.pkgs <- c('repr', 'IRdisplay', 'pbdZMQ', 'devtools')
is.installed <- vapply(notebook.pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is.installed)) {
  install.packages(notebook.pkgs[!is.installed])
}
devtools::install_github('IRkernel/repr')
# install.packages("plotly")
## library() stops with an error when a package is missing, whereas
## require() would only return FALSE and fail later at first use.
library(plotly)
library(data.table)
## Note: this overwrites any earlier `months` vector with the integers 1..12.
months <- 1:12
## One column per year (1949-1960), one row per month.
ap <- data.table(matrix(AirPassengers, nrow = 12, ncol = 12))
## setnames() renames by reference, which keeps the table safe to modify
## with := on the next line.
setnames(ap, as.character(1949:1960))
ap[, month := months]
## Long format: one row per (month, year) pair with its passenger count.
ap <- melt(ap, id.vars = 'month')
setnames(ap, c("month", "year", "count"))
## 3-D scatter of count against month and year, coloured by month.
p <- plot_ly(ap, x = ~month, y = ~year, z = ~count,
             color = ~as.factor(month)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Month'),
                      yaxis = list(title = 'Year'),
                      zaxis = list(title = 'PassengerCount')))
embed_notebook(p)
## Read the sample 3-D line data set from the plotly datasets repository and
## draw its x1/y1/z1 columns as a thin 3-D line.
file.location <- 'https://raw.githubusercontent.com/plotly/datasets/master/_3d-line-plot.csv'
data <- read.csv(file = file.location)
p <- plot_ly(
  data,
  x = ~x1,
  y = ~y1,
  z = ~z1,
  type = 'scatter3d',
  mode = 'lines',
  line = list(color = '#1f77b4', width = 1)
)
embed_notebook(p)