When cleaning data, we often have to clean up strings of text. Doing this in R used to be a pain until the stringr package came along. For example, it contains a function called str_trim()
that lets you easily remove leading and trailing whitespace from a string. It also contains a function called str_sub()
that lets you easily extract substrings from any string. Run the following code to see these two functions at work.
library(stringr)
# Demonstrates str_trim() and str_sub() on a sample name padded with
# whitespace. Lines starting with `##` show the value each call prints.
gmlang <- "\t Guangming Lang \n"
# trim whitespace on both sides
str_trim(gmlang)
## [1] "Guangming Lang"
# trim whitespace on the left side
str_trim(gmlang, side = "left")
## [1] "Guangming Lang \n"
# trim whitespace on the right side
str_trim(gmlang, side = "right")
## [1] "\t Guangming Lang"
# re-assign trimmed value to gmlang
gmlang <- str_trim(gmlang)
# extract first name
str_sub(gmlang, start = 1, end = 9)
## [1] "Guangming"
# same result with `start` omitted
str_sub(gmlang, end = 9)
## [1] "Guangming"
# extract last name
str_sub(gmlang, start = 11, end = 14)
## [1] "Lang"
# same result with `end` omitted
str_sub(gmlang, start = 11)
## [1] "Lang"
# extract first and last name at the same time:
# vectors of start/end positions yield one substring per pair
str_sub(gmlang, start = c(1, 11), end = c(9, 14))
## [1] "Guangming" "Lang"
# something fun :)
# one start position per character gives every suffix of the string
str_sub(gmlang, start = seq_len(str_length(gmlang)))
## [1] "Guangming Lang" "uangming Lang" "angming Lang" "ngming Lang"
## [5] "gming Lang" "ming Lang" "ing Lang" "ng Lang"
## [9] "g Lang" " Lang" "Lang" "ang"
## [13] "ng" "g"
# one end position per character gives every prefix of the string
str_sub(gmlang, end = seq_len(str_length(gmlang)))
## [1] "G" "Gu" "Gua" "Guan"
## [5] "Guang" "Guangm" "Guangmi" "Guangmin"
## [9] "Guangming" "Guangming " "Guangming L" "Guangming La"
## [13] "Guangming Lan" "Guangming Lang"