# Chapter 5 stringr and lubridate
#
# 2017-05-24 조용순 전공의 강의
#
# 11주차 강의 자료입니다.

#"Stringr"
#R 표준 base 패키지에 포함된 함수군과 비슷한 기능을 하는 것으로 보이지만 더 합리적인 출력형식을 가지므로 사용하기 편리함
#패키지의 특징
#1) factor와 character를 같은 방식으로 처리
#2) 일관성 있는 함수 이름과 인수
#3) 다른 함수의 입력값으로 사용하기 편리한 출력값.
#  -입력값 NA가 포함되어 있을 때는 그 부분의 결과를 NA로 돌려줌
#4) 사용빈도가 떨어지는 문자열 조작 처리를 과감하게 제거하여 간략화시킴

#1. Installation
#install.packages("stringr")

library(stringr)

#2. Functions
# 1) str_length(string): number of characters in each string.
# Counterpart of base::nchar(x).
# A missing string gives NA. For nchar() the two recorded transcripts below
# disagree on the NA element: one shows NA, the other shows 2.
# NOTE(review): the 2-vs-NA difference presumably comes from the R version
# each transcript was produced with -- confirm.
str_length(c("i", "like", "programming", NA))
## [1]  1  4 11 NA
nchar(c("i", "like", "programming", NA))
## [1]  1  4 11 NA
#> [1]  1  4 11  2   (second recorded transcript)

# 2) str_sub(string, start = 1, end = -1)
# Extracts (or replaces) part of a string; counterpart of base::substr().
# Negative positions count backwards from the end of the string.
x <- "Michael Carreon"

# The space counts as a character, so the first 9 characters come back.
str_sub(x, start = 1, end = 9)
## [1] "Michael C"

# `start` and `end` may also be given positionally.
str_sub(x, 1, 9)
## [1] "Michael C"

# With `start` omitted its default of 1 is used, i.e. the same as
# str_sub(x, 1, 7).
str_sub(x, end = 7)
## [1] "Michael"

# A negative start returns everything from the 7th character from the end.
str_sub(x, -7)
## [1] "Carreon"

# Base R
substr(x, 1, 7)
## [1] "Michael"

# 3) str_c(..., sep = '', collapse = NULL)
# Joins strings together.
# The default `sep` is the empty string, not a space, so it behaves like
# base::paste0().
str_c(letters[-26], " comes before ", letters[-1])
##  [1] "a comes before b" "b comes before c"
##  [3] "c comes before d" "d comes before e"
##  [5] "e comes before f" "f comes before g"
##  [7] "g comes before h" "h comes before i"
##  [9] "i comes before j" "j comes before k"
## [11] "k comes before l" "l comes before m"
## [13] "m comes before n" "n comes before o"
## [15] "o comes before p" "p comes before q"
## [17] "q comes before r" "r comes before s"
## [19] "s comes before t" "t comes before u"
## [21] "u comes before v" "v comes before w"
## [23] "w comes before x" "x comes before y"
## [25] "y comes before z"
# Base R
paste0(letters[-26], " comes before ", letters[-1])
##  [1] "a comes before b" "b comes before c"
##  [3] "c comes before d" "d comes before e"
##  [5] "e comes before f" "f comes before g"
##  [7] "g comes before h" "h comes before i"
##  [9] "i comes before j" "j comes before k"
## [11] "k comes before l" "l comes before m"
## [13] "m comes before n" "n comes before o"
## [15] "o comes before p" "p comes before q"
## [17] "q comes before r" "r comes before s"
## [19] "s comes before t" "t comes before u"
## [21] "u comes before v" "v comes before w"
## [23] "w comes before x" "x comes before y"
## [25] "y comes before z"
# `collapse` joins the elements of one vector into a single string.
str_c(letters, collapse = ", ")
## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
# sep vs. collapse: `sep` goes between separate arguments, `collapse` between
# the elements of a single vector. Both calls below give the same string.
str_c("안","녕","하","세","요",sep="_")
## [1] "안_녕_하_세_요"
str_c(c("안","녕","하","세","요"),collapse="_")
## [1] "안_녕_하_세_요"

# 4) str_split(string, pattern, n = Inf)
# Splits each string on a pattern; the result is a list.
# Counterpart of base::strsplit(x, split).
# str_split_fixed() is the variant whose result is a matrix.
fruits <- c("apples and oranges and pears and bananas", "pineapples and mangos and guavas")
str_split(fruits, " and ")
## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"
# Base R
# The split string here is "and" without the surrounding spaces, which is why
# the pieces below keep their leading/trailing spaces.
strsplit(fruits, "and")
## [[1]]
## [1] "apples "   " oranges " " pears "   " bananas" 
## 
## [[2]]
## [1] "pineapples " " mangos "    " guavas"
# `n` caps the number of pieces; whatever is left stays in the last piece.
str_split(fruits, " and ", n = 3)
## [[1]]
## [1] "apples"            "oranges"          
## [3] "pears and bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"
str_split(fruits, " and ", n = 2)
## [[1]]
## [1] "apples"                       
## [2] "oranges and pears and bananas"
## 
## [[2]]
## [1] "pineapples"        "mangos and guavas"
# Matrix result with 4 columns; the shorter row is padded with "".
str_split_fixed(fruits, " and ", 4)
##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""

# 5) str_detect(string, pattern)
# Returns a logical vector (TRUE/FALSE): does each string contain a match?
# Counterpart of base::grepl(pattern, x).
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
## [1] TRUE TRUE TRUE TRUE
# "^a": an "a" at the start of the string.
str_detect(fruit, "^a")
## [1]  TRUE FALSE FALSE FALSE
# "a$": an "a" at the end of the string.
str_detect(fruit, "a$")
## [1] FALSE  TRUE FALSE FALSE
str_detect(fruit, "b")
## [1] FALSE  TRUE FALSE FALSE
# "[aeiou]": any one of the listed vowels.
str_detect(fruit, "[aeiou]")
## [1] TRUE TRUE TRUE TRUE

# 6) str_count(string, pattern)
# Returns the number of matches in each string, i.e. how many times the
# pattern occurs in it.
str_count(fruit, "p")
## [1] 2 0 1 3
# A vector of patterns is paired element by element with the strings:
# "a" in the 1st, "b" in the 2nd, "p" in the 3rd and 4th.
str_count(fruit, c("a", "b", "p", "p"))
## [1] 1 1 1 3

# 7) str_locate(string, pattern)
# Returns a matrix with the start and end position of the first match in each
# string; rows for strings without a match are NA.
str_locate(fruit, "e")
##      start end
## [1,]     5   5
## [2,]    NA  NA
## [3,]     2   2
## [4,]     8   8
# A two-character pattern spans two positions (start and end differ by one).
str_locate(fruit, "pl")
##      start end
## [1,]     3   4
## [2,]    NA  NA
## [3,]    NA  NA
## [4,]     6   7

# 8) str_extract(string, pattern)
# Extracts the matched substring from each string.
# Elements without a match come back as NA.
# Similar to base::grep(pattern, x, value = TRUE), except that grep() drops
# the non-matching elements and returns the matching ones whole.
shopping_list <- c("apples x4", "flour", "sugar", "milk x2")
str_extract(shopping_list, "\\d")
## [1] "4" NA  NA  "2"
grep("\\d", shopping_list, value = TRUE)
## [1] "apples x4" "milk x2"
str_extract(shopping_list, "[a-z]+")
## [1] "apples" "flour"  "sugar"  "milk"
grep("[a-z]+", shopping_list, value = TRUE)
## [1] "apples x4" "flour"     "sugar"     "milk x2"

# 9) str_match(string, pattern)
# Extracts the matched substring together with its capture groups and returns
# them as a matrix.
# Column 1 holds what str_extract() returns; columns 2 onward hold the text
# matched by each parenthesised group, in order.
strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", "387 287 6718", "apple", "233.398.9187  ", "482 952 3315", "239 923 8115", "842 566 4692", "Work: 579-499-7527", "$1000", "Home: 543.355.3679")
# Three digit groups (3-3-4 digits, the first not starting with 0 or 1)
# separated by "-", " " or ".".
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
str_extract(strings, phone)
##  [1] "219 733 8965" "329-293-8753" NA            
##  [4] "595 794 7569" "387 287 6718" NA            
##  [7] "233.398.9187" "482 952 3315" "239 923 8115"
## [10] "842 566 4692" "579-499-7527" NA            
## [13] "543.355.3679"
str_match(strings, phone)
##       [,1]           [,2]  [,3]  [,4]  
##  [1,] "219 733 8965" "219" "733" "8965"
##  [2,] "329-293-8753" "329" "293" "8753"
##  [3,] NA             NA    NA    NA    
##  [4,] "595 794 7569" "595" "794" "7569"
##  [5,] "387 287 6718" "387" "287" "6718"
##  [6,] NA             NA    NA    NA    
##  [7,] "233.398.9187" "233" "398" "9187"
##  [8,] "482 952 3315" "482" "952" "3315"
##  [9,] "239 923 8115" "239" "923" "8115"
## [10,] "842 566 4692" "842" "566" "4692"
## [11,] "579-499-7527" "579" "499" "7527"
## [12,] NA             NA    NA    NA    
## [13,] "543.355.3679" "543" "355" "3679"

# 10) str_replace(string, pattern, replacement)
# Replaces the matched part and leaves the rest of the string untouched.
# Same job as base::sub(pattern, replacement, x).
fruits <- c("one apple", "two pears", "three bananas")
# str_replace() changes only the first match in each string ...
str_replace(fruits, "[aeiou]", "-")
## [1] "-ne apple"     "tw- pears"     "thr-e bananas"
# ... while str_replace_all() changes every match.
str_replace_all(fruits, "[aeiou]", "-")
## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

# 11) str_trim(string, side = "both")
# Removes whitespace from the ends of a string.
str_trim("        fruits        ", side = "both")
## [1] "fruits"
# Base R way of trimming: strip leading and trailing whitespace from every
# element of `x`.
#
# x: a character vector (gsub() coerces other inputs to character).
# Returns: a character vector of the same length as `x`; NA stays NA.
Trim <- function(x) {
  # "^\\s+" matches the leading run of whitespace, "\\s+$" the trailing run;
  # gsub() removes both in a single pass.
  gsub("^\\s+|\\s+$", "", x)
}
Trim("        fruits        ")
## [1] "fruits"

#"lubridate"
# The name combines "lubricate" (to oil, to make something run smoothly) and
# "date".
# Lubridate is an R package that makes it easier to work with dates and times
# 1. Installation
# Kept commented out: the recorded run of this call failed with
#   Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
# and an unconditional install would be attempted on every run of the script.
# Run it once by hand if the package is missing.
# install.packages("lubridate")
library(lubridate)
## Loading required package: methods
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# 2. Functions
# 1) Parsing dates and times (building date and date-time objects)
## Date
# Base R
as.Date("2011-06-04")
## [1] "2011-06-04"
as.Date("2011-6-4")
## [1] "2011-06-04"
as.Date("2011/06/04")
## [1] "2011-06-04"

# Without separators as.Date() raises an error. try() reports the error and
# lets the rest of the script keep running when it is sourced.
try(as.Date("20110604"))
## Error in charToDate(x): character string is not in a standard unambiguous format

# Problem: month-day-year order (US style) does not fail, it is silently
# misread as a date in year 6.
as.Date("06-04-2011")
## [1] "0006-04-20"
# lubridate package
ymd("2011/06/04")
## [1] "2011-06-04"
# The letters in the function name give the order of the fields, so a
# different field order is handled by picking the matching function.
mdy("06/04/2011")
## [1] "2011-06-04"
dmy("04/06/2011")
## [1] "2011-06-04"

# lubridate is tolerant about how the date is written: every heterogeneous
# format below parses to the same date.
ymd("2011/06/04")
## [1] "2011-06-04"
ymd("2011-06-04")
## [1] "2011-06-04"
ymd("20110604")
## [1] "2011-06-04"
ymd("110604")
## [1] "2011-06-04"
ymd("11.06.04")
## [1] "2011-06-04"
ymd("11,06,04")
## [1] "2011-06-04"
ymd("11_06.04") 
## [1] "2011-06-04"
ymd("2011  06  04") 
## [1] "2011-06-04"
ymd("2011!?06??!04") 
## [1] "2011-06-04"
ymd("2011 =06??04") 
## [1] "2011-06-04"

## Building date + time objects
# Base R
as.POSIXct("2011-06-04 13:30:50")
## [1] "2011-06-04 13:30:50 KST"
as.POSIXct("2011-06-04 13") # does not work: the hour is dropped in the output
## [1] "2011-06-04 KST"
strptime("2011-06-04 13:30:50", "%Y-%m-%d %H:%M:%S")
## [1] "2011-06-04 13:30:50 KST"

# lubridate package
# The recorded output is in UTC, whereas the base R results above were
# printed in KST.
ymd_hms("2011-06-04 13:30:50")
## [1] "2011-06-04 13:30:50 UTC"
# More flexible: a value that stops at the hour can be parsed with ymd_h().
ymd_h("2011-06-04 13")
## [1] "2011-06-04 13:00:00 UTC"

# 2) Setting and Extracting information
# Convenience functions for pulling out one component of a date-time; the
# names say what they return:
# second(), minute(), hour(), day(), wday(), yday(), week(), month(), year()
ld1 <- ymd_hms("2011-06-04 13:30:50")

year(ld1)
## [1] 2011
month(ld1)
## [1] 6
day(ld1)
## [1] 4
wday(ld1)
## [1] 7
yday(ld1)
## [1] 155
hour(ld1)
## [1] 13
minute(ld1)
## [1] 30
second(ld1)
## [1] 50

# month() and wday() take a `label` argument; with label = TRUE the recorded
# output is an ordered factor instead of a plain number.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# Two recorded runs are kept below because their labels differ.
# NOTE(review): presumably the labels follow the session locale (the first
# run's weekday labels are Korean) -- confirm.
month(ld1, label = TRUE)
## [1] 6
## 12 Levels: 1 < 2 < 3 < 4 < 5 < 6 < 7 < 8 < ... < 12
## [1] Jun
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(ld1, label = TRUE)
## [1] 토
## Levels: 일 < 월 < 화 < 수 < 목 < 금 < 토
## [1] Sat
## Levels: Sun < Mon < Tues < Wed < Thurs < Fri < Sat

# 3) Update date-time
# ld1 holds "2011-06-04 13:30:50"; assigning through hour() changes the hour
# to 10.
hour(ld1) <- 10
ld1
## [1] "2011-06-04 10:30:50 UTC"

# update() sets ld1 back from 10 o'clock to 13 o'clock.
ld1 <- update(ld1, hour = 13)
ld1
## [1] "2011-06-04 13:30:50 UTC"

# 4) Arithmetic with date times
# Presented here as the most important reason for learning a package like
# lubridate.
# The arithmetic family uses the extractor names with a trailing "s", which
# changes their role:
# days(), seconds(), minutes(), hours(), weeks(), years(), milliseconds(), microseconds(), nanoseconds(), picoseconds()
ymd("2016-01-30") + days(2)
## [1] "2016-02-01"
# The amount can be a vector, giving one result date per element.
ymd("2016-01-30") - days(1:30)
##  [1] "2016-01-29" "2016-01-28" "2016-01-27"
##  [4] "2016-01-26" "2016-01-25" "2016-01-24"
##  [7] "2016-01-23" "2016-01-22" "2016-01-21"
## [10] "2016-01-20" "2016-01-19" "2016-01-18"
## [13] "2016-01-17" "2016-01-16" "2016-01-15"
## [16] "2016-01-14" "2016-01-13" "2016-01-12"
## [19] "2016-01-11" "2016-01-10" "2016-01-09"
## [22] "2016-01-08" "2016-01-07" "2016-01-06"
## [25] "2016-01-05" "2016-01-04" "2016-01-03"
## [28] "2016-01-02" "2016-01-01" "2015-12-31"
# Adding months to the 31st gives NA for every month that has no 31st day.
# NOTE(review): lubridate documents the %m+% operator for month arithmetic
# that rolls back to the last day of the month instead -- confirm before use.
ymd("2013-01-31") + months(0:11)
##  [1] "2013-01-31" NA           "2013-03-31"
##  [4] NA           "2013-05-31" NA          
##  [7] "2013-07-31" "2013-08-31" NA          
## [10] "2013-10-31" NA           "2013-12-31"

# 5) Application with lubridate and dplyr
# Uses the `lakers` data set that ships with the lubridate package.
# data(lakers)

# dplyr provides %>%, as_tibble() and the data verbs used in this part of
# the script. Attach it explicitly so the script does not depend on an
# earlier session having loaded it (the recorded sessionInfo() lists it as
# attached).
library(dplyr)
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Korean_Korea.949 
## [2] LC_CTYPE=Korean_Korea.949   
## [3] LC_MONETARY=Korean_Korea.949
## [4] LC_NUMERIC=C                
## [5] LC_TIME=Korean_Korea.949    
## 
## attached base packages:
## [1] methods   stats     graphics  grDevices utils    
## [6] datasets  base     
## 
## other attached packages:
## [1] lubridate_1.7.1  stringr_1.2.0    lattice_0.20-35 
## [4] gdtools_0.1.6    dplyr_0.7.4.9000 readxl_1.0.0    
## [7] knitr_1.17.20   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14     rstudioapi_0.7   bindr_0.1       
##  [4] magrittr_1.5     tidyselect_0.2.3 R6_2.2.2        
##  [7] rlang_0.1.4      tools_3.4.3      grid_3.4.3      
## [10] htmltools_0.3.6  yaml_2.1.16      rprojroot_1.3-1 
## [13] digest_0.6.13    assertthat_0.2.0 tibble_1.3.4    
## [16] bookdown_0.5.10  bindrcpp_0.2     purrr_0.2.4.9000
## [19] glue_1.2.0       evaluate_0.10.1  rmarkdown_1.8.3 
## [22] stringi_1.1.6    compiler_3.4.3   cellranger_1.1.0
## [25] backports_1.1.2  svglite_1.2.1    pkgconfig_2.0.1

# Convert to a tibble for compact printing. as_tibble() replaces tbl_df(),
# which dplyr has deprecated.
lakers <- as_tibble(lakers)
lakers # --> date and time are stored as two separate variables.
## # A tibble: 34,624 x 13
##        date opponent game_type  time period      etype
##       <int>    <chr>     <chr> <chr>  <int>      <chr>
##  1 20081028      POR      home 12:00      1  jump ball
##  2 20081028      POR      home 11:39      1       shot
##  3 20081028      POR      home 11:37      1    rebound
##  4 20081028      POR      home 11:25      1       shot
##  5 20081028      POR      home 11:23      1    rebound
##  6 20081028      POR      home 11:22      1       shot
##  7 20081028      POR      home 11:22      1       foul
##  8 20081028      POR      home 11:22      1 free throw
##  9 20081028      POR      home 11:00      1       foul
## 10 20081028      POR      home 10:53      1       shot
## # ... with 34,614 more rows, and 7 more variables:
## #   team <chr>, player <chr>, result <chr>,
## #   points <int>, type <chr>, x <int>, y <int>

# Build a single date-time column:
#   1. paste the `date` and `time` values together and parse the text with
#      ymd_hm(), overwriting `date`;
#   2. rename the parsed column to `time_index`;
#   3. drop the `time` column, which is no longer needed.
# NOTE(review): ymd_hm() reads `time` as hours:minutes; the values look like
# a game clock ("12:00", "11:39", ...) -- confirm this reading is intended.
lakers <- lakers %>%
  mutate(date = ymd_hm(paste(date, time))) %>%
  dplyr::rename(time_index = date) %>%
  select(-time)

lakers
## # A tibble: 34,624 x 12
##             time_index opponent game_type period
##                 <dttm>    <chr>     <chr>  <int>
##  1 2008-10-28 12:00:00      POR      home      1
##  2 2008-10-28 11:39:00      POR      home      1
##  3 2008-10-28 11:37:00      POR      home      1
##  4 2008-10-28 11:25:00      POR      home      1
##  5 2008-10-28 11:23:00      POR      home      1
##  6 2008-10-28 11:22:00      POR      home      1
##  7 2008-10-28 11:22:00      POR      home      1
##  8 2008-10-28 11:22:00      POR      home      1
##  9 2008-10-28 11:00:00      POR      home      1
## 10 2008-10-28 10:53:00      POR      home      1
## # ... with 34,614 more rows, and 8 more variables:
## #   etype <chr>, team <chr>, player <chr>,
## #   result <chr>, points <int>, type <chr>, x <int>,
## #   y <int>

# Using group_by(): mean of x and y per calendar month, with month() applied
# to time_index as the grouping key. For yearly means use year() instead.
# na.rm = TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
lakers %>%
  group_by(month(time_index)) %>%
  summarize(mean_x = mean(x, na.rm = TRUE), mean_y = mean(y, na.rm = TRUE))
## # A tibble: 7 x 3
##   `month(time_index)` mean_x mean_y
##                 <dbl>  <dbl>  <dbl>
## 1                   1  25.49  13.89
## 2                   2  25.02  13.17
## 3                   3  25.52  13.21
## 4                   4  25.38  13.46
## 5                  10  24.92  13.12
## 6                  11  25.47  13.37
## 7                  12  25.06  13.48


# Using filter(): keep the rows whose time_index is at or before
# "2008-10-28 12:00:00" (the comparison is inclusive).
filter(lakers, time_index <= ymd_hms("2008-10-28 12:00:00"))
## # A tibble: 416 x 12
##             time_index opponent game_type period
##                 <dttm>    <chr>     <chr>  <int>
##  1 2008-10-28 12:00:00      POR      home      1
##  2 2008-10-28 11:39:00      POR      home      1
##  3 2008-10-28 11:37:00      POR      home      1
##  4 2008-10-28 11:25:00      POR      home      1
##  5 2008-10-28 11:23:00      POR      home      1
##  6 2008-10-28 11:22:00      POR      home      1
##  7 2008-10-28 11:22:00      POR      home      1
##  8 2008-10-28 11:22:00      POR      home      1
##  9 2008-10-28 11:00:00      POR      home      1
## 10 2008-10-28 10:53:00      POR      home      1
## # ... with 406 more rows, and 8 more variables:
## #   etype <chr>, team <chr>, player <chr>,
## #   result <chr>, points <int>, type <chr>, x <int>,
## #   y <int>

# "2008-10-28 12:00:00" ~ "2009-03-09 00:33:00" 의 기간에 대해서 서브세팅
# Keep the rows between the two time stamps (both ends inclusive) by
# combining two comparisons.
filter(
  lakers,
  time_index >= ymd_hms("2008-10-28 12:00:00"),
  time_index <= ymd_hms("2009-03-09 00:33:00")
)
## # A tibble: 25,554 x 12
##             time_index opponent game_type period
##                 <dttm>    <chr>     <chr>  <int>
##  1 2008-10-28 12:00:00      POR      home      1
##  2 2008-10-29 12:00:00      LAC      away      1
##  3 2008-10-29 11:36:00      LAC      away      1
##  4 2008-10-29 11:24:00      LAC      away      1
##  5 2008-10-29 11:24:00      LAC      away      1
##  6 2008-10-29 11:08:00      LAC      away      1
##  7 2008-10-29 10:58:00      LAC      away      1
##  8 2008-10-29 10:57:00      LAC      away      1
##  9 2008-10-29 10:41:00      LAC      away      1
## 10 2008-10-29 10:40:00      LAC      away      1
## # ... with 25,544 more rows, and 8 more variables:
## #   etype <chr>, team <chr>, player <chr>,
## #   result <chr>, points <int>, type <chr>, x <int>,
## #   y <int>

# The same subset reads more directly with interval() and %within%
# (the %--% operator can be used in place of interval()).
inter <- interval(
  ymd_hms("2008-10-28 12:00:00"),
  ymd_hms("2009-03-09 00:33:00")
)
filter(lakers, time_index %within% inter)
## # A tibble: 25,554 x 12
##             time_index opponent game_type period
##                 <dttm>    <chr>     <chr>  <int>
##  1 2008-10-28 12:00:00      POR      home      1
##  2 2008-10-29 12:00:00      LAC      away      1
##  3 2008-10-29 11:36:00      LAC      away      1
##  4 2008-10-29 11:24:00      LAC      away      1
##  5 2008-10-29 11:24:00      LAC      away      1
##  6 2008-10-29 11:08:00      LAC      away      1
##  7 2008-10-29 10:58:00      LAC      away      1
##  8 2008-10-29 10:57:00      LAC      away      1
##  9 2008-10-29 10:41:00      LAC      away      1
## 10 2008-10-29 10:40:00      LAC      away      1
## # ... with 25,544 more rows, and 8 more variables:
## #   etype <chr>, team <chr>, player <chr>,
## #   result <chr>, points <int>, type <chr>, x <int>,
## #   y <int>