R Lesson #7 - Searching characters
# This lesson describes how to search for a pattern inside of a character
# vector. R has several functions that accept search patterns known as
# regular expressions. The different search functions are detailed below.

# FUNCTION   OUTPUT_TYPE   DESCRIPTION
# strsplit   list          explode string at matches
# gsub       character     replace matches
# grep       integer       return index of matches
# grepl      logical       TRUE if matched
# gregexpr   list          position/length of match(es)

x <- "The quick brown fox jumps over the lazy dog."
y <- strsplit(x, " ")  # strsplit() returns a list, here with one element
y
y <- y[[1]]            # keep the character vector of words for the searches
grep("u", y)           # find "u"
gsub("o", "OOOO", y)   # replace "o" with "OOOO"
grepl("u", y)          # find "u"
which(grepl("u", y))   # equivalent to grep
strsplit(y, "u")       # returns a list

# to figure out where the match occurred,
# we can use gregexpr, which returns a
# somewhat complicated list structure
g <- gregexpr("u", y)  # returns a list with one element per word in y
str(g)
g                      # position of matches or -1 (no match)
attributes(g[[2]])$match.length  # length of the match found in the 2nd word
regmatches(y, g)       # get back the matches
# Effect of the optional grep() arguments on a small character vector.
test1 <- c("abc", "Abc", "ABC", "ABCD")
grep("abc", test1)  # with the default arguments
grep("abc", test1, value = TRUE)  # return the matching strings, not indices
# with only letters the behavior is as expected,
# special characters will be interpreted as
# regular expressions (unless fixed=TRUE)
grep("abc", test1, fixed = TRUE)        # search exactly (fast)
grep("abc", test1, ignore.case = TRUE)  # match regardless of letter case
grep("abc", test1, invert = TRUE)       # indices of elements that do NOT match
?regex  # open the help page describing R's regular-expression syntax
# these can get very advanced, for example:
# "/^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/"
# can be used to match many common email addresses
# (shown in /.../ notation; inside an R string the surrounding slashes are
# dropped and every backslash must be doubled)

# Repetition quantifiers:
# "." matches any single character
# "+" suffix means one or more times
# "*" suffix means zero or more times
# "?" suffix means zero or one time
test2 <- c("m+n", "man", "moon", "mn", "mooon", "mon")
# zero or more 'o' characters between m and n
grep("mo*n", test2, value = TRUE)
# one or more 'o' characters between m and n
grep("mo+n", test2, value = TRUE)
# exactly one character (any) between m and n
grep("m.n", test2, value = TRUE)
# zero or more characters between m and n
grep("m.*n", test2, value = TRUE)
# zero or one 'o' characters between m and n
grep("mo?n", test2, value = TRUE)

# special cases:
# search for a special character (+) as is
# unescaped, "+" is still a quantifier: this finds "mn", not "m+n"
grep("m+n", test2, value = TRUE)
# escaping with a double backslash makes "+" a literal character
grep("m\\+n", test2, value = TRUE)
# fixed = TRUE disables regular expressions entirely
grep("m+n", test2, fixed = TRUE, value = TRUE)
# 2, 3 or 4 'o' characters between m and n
grep("mo{2,4}n", test2, value = TRUE)

# Anchoring: beginning and ending
# "^" means must start at the beginning
# "$" means must finish at the end
test3 <- c("abc", "abc d", "abcd", "fabc")
grep("abc", test3, value = TRUE)
grep("^abc", test3, value = TRUE)
grep("abc$", test3, value = TRUE)
grep("^abc$", test3, value = TRUE)

# Alternation constructs - either/or
test4 <- c("men", "man", "women", "moon", "maan", "mn")
# the pipe "|" means OR
grep("e|a", test4, value = TRUE)
# combining multiple special characters together
# use parentheses to group part of the expression
grep("m(e|a)+n", test4, value = TRUE)
# parentheses can also group multiple characters
grep("(en|an)", test4, value = TRUE)

# Character classes - matching in a group
test5 <- c("moon", "maan", "mn", "myn", "m_n", "mDn")
# square brackets "[]" define a "character class":
# only vowels (zero or more, because of "*") located between m and n
grep("m[aeiou]*n", test5, value = TRUE)
# the "^" in a character class means NOT
# anything but a vowel located between m and n
grep("m[^aeiou]*n", test5, value = TRUE)
# the "-" in a character class defines a range
# only lowercase consonants located between m and n
grep("m[b-df-hj-np-tv-z]*n", test5, value = TRUE)
# Built-in classes: \d, \w
sentence <- "pi is approximately equal to 3.14 or 22/7."
# match numbers without escaping "."
# (an unescaped "." matches any character, so "22/7" is picked up as well)
nlist_matches1 <- gregexpr("\\d+.\\d+", sentence)
nlist1 <- regmatches(sentence, nlist_matches1)
nlist1
# match numbers while escaping "."
nlist_matches2 <- gregexpr("\\d+\\.\\d+", sentence)
nlist2 <- regmatches(sentence, nlist_matches2)
nlist2
# list of all words
wlist_matches1 <- gregexpr("\\w+", sentence)
wlist1 <- regmatches(sentence, wlist_matches1)
wlist1
# words with only letters
wlist_matches2 <- gregexpr("[[:alpha:]]+", sentence)
wlist2 <- regmatches(sentence, wlist_matches2)
wlist2
# words starting with an Upper case letter
# (this sentence has no capital letters, so the result is empty)
wlist_matches3 <- gregexpr("[[:upper:]][[:alpha:]]+", sentence)
wlist3 <- regmatches(sentence, wlist_matches3)
wlist3

# Word boundaries
# words starting with an "a"
wlist_matches4 <- gregexpr("\\<(a|A)[[:alpha:]]+", sentence)
wlist4 <- regmatches(sentence, wlist_matches4)
wlist4
# words ending in a vowel
wlist_matches5 <- gregexpr("[[:alpha:]]+[aeiou]\\>", sentence)
wlist5 <- regmatches(sentence, wlist_matches5)
wlist5