by Guangming Lang
1 min read

Categories

  • r

When cleaning data, we often have to clean up strings of text. Doing this in R used to be a pain until the stringr package came along. For example, it provides a function called str_trim() that lets you easily remove leading and trailing whitespace from a string, and a function called str_sub() that lets you easily extract substrings from a string. Run the following code to see these two functions in action.

# Demonstrates stringr's str_trim() and str_sub() on one padded string.
# Lines starting with "##" show what the preceding call prints.
library(stringr)

# sample string: tab + space on the left, space + newline on the right
gmlang <- "\t Guangming Lang \n"

# trim whitespace on both sides
str_trim(gmlang)
## [1] "Guangming Lang"
# trim whitespace on the left side
str_trim(gmlang, side = "left")
## [1] "Guangming Lang \n"
# trim whitespace on the right side
str_trim(gmlang, side = "right")
## [1] "\t Guangming Lang"
# re-assign trimmed value to gmlang; the positions used below refer to
# this trimmed, 14-character string
gmlang <- str_trim(gmlang)

# extract first name (characters 1 through 9)
str_sub(gmlang, start = 1, end = 9)
## [1] "Guangming"
# omitting start begins at the first character
str_sub(gmlang, end = 9)
## [1] "Guangming"
# extract last name (characters 11 through 14)
str_sub(gmlang, start = 11, end = 14)
## [1] "Lang"
# omitting end runs through the last character
str_sub(gmlang, start = 11)
## [1] "Lang"
# extract first and last name at the same time: start and end are
# vectorised, so each start/end pair yields one substring
str_sub(gmlang, start = c(1, 11), end = c(9, 14))
## [1] "Guangming" "Lang"
# something fun :)
# one start position per character gives every suffix of the string
str_sub(gmlang, start = seq_len(str_length(gmlang)))
##  [1] "Guangming Lang" "uangming Lang"  "angming Lang"   "ngming Lang"   
##  [5] "gming Lang"     "ming Lang"      "ing Lang"       "ng Lang"       
##  [9] "g Lang"         " Lang"          "Lang"           "ang"           
## [13] "ng"             "g"
# one end position per character gives every prefix of the string
str_sub(gmlang, end = seq_len(str_length(gmlang)))
##  [1] "G"              "Gu"             "Gua"            "Guan"          
##  [5] "Guang"          "Guangm"         "Guangmi"        "Guangmin"      
##  [9] "Guangming"      "Guangming "     "Guangming L"    "Guangming La"  
## [13] "Guangming Lan"  "Guangming Lang"