DECIPHER logo

  • Alignment▸
  • Classification▸
  • Homology▸
  • Oligo Design▸
  • Phylogenetics▸
  • Tutorials▾
  • Examples Gallery
  • Documentation
  • R Lessons
  • Bioinformatics
  • Home
  • News
  • Downloads
  • Contact
  • Citations

R Lesson #7 - Searching characters

This lesson describes how to search for a pattern inside of a character vector. R has several functions that accept search patterns known as regular expressions. The different search functions are detailed below.

Show output
123456-789



1011
12


13

14
15


























-1617181920



























21






























































22
23

























# FUNCTION OUTPUT_TYPE DESCRIPTION# strsplit list explode string at matches# gsub character replace matches# grep integer return index of matches# grepl logical TRUE if matched# gregexpr list position/length of match(es) x <- "The quick brown fox jumps over the lazy dog."y <- strsplit(x, " ")y[[1]][1] "The" "quick" "brown" "fox" "jumps" "over" [7] "the" "lazy" "dog."
y <- y[[1]]grep("u", y) # find "u"[1] 2 5gsub("o", "OOOO", y) # replace "o" with "OOOO"[1] "The" "quick" "brOOOOwn" "fOOOOx" [5] "jumps" "OOOOver" "the" "lazy" [9] "dOOOOg." grepl("u", y) # find "u"[1] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE[9] FALSEwhich(grepl("u", y)) # equivalent to grep[1] 2 5strsplit(y, "u") # returns a list[[1]][1] "The"
[[2]][1] "q" "ick"
[[3]][1] "brown"
[[4]][1] "fox"
[[5]][1] "j" "mps"
[[6]][1] "over"
[[7]][1] "the"
[[8]][1] "lazy"
[[9]][1] "dog."
# to figure out where the match occurred,# we can use gregexpr, which returns a# somewhat complicated list structureg <- gregexpr("u", y) # returns a liststr(g)List of 9 $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] 2 ..- attr(*, "match.length")= int 1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] 2 ..- attr(*, "match.length")= int 1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUE $ : atomic [1:1] -1 ..- attr(*, "match.length")= int -1 ..- attr(*, "useBytes")= logi TRUEg # position of matches or -1 (no match)[[1]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[2]][1] 2attr(,"match.length")[1] 1attr(,"useBytes")[1] TRUE
[[3]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[4]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[5]][1] 2attr(,"match.length")[1] 1attr(,"useBytes")[1] TRUE
[[6]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[7]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[8]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
[[9]][1] -1attr(,"match.length")[1] -1attr(,"useBytes")[1] TRUE
attributes(g[[2]])$match.length[1] 1regmatches(y, g) # get back the matches[[1]]character(0)
[[2]][1] "u"
[[3]]character(0)
[[4]]character(0)
[[5]][1] "u"
[[6]]character(0)
[[7]]character(0)
[[8]]character(0)
[[9]]character(0)
Finding an exact pattern, also known as literal matching, is very fast. Depending on its arguments, the grep function can return the indices of the matching elements (the default), the indices of the non-matching elements (invert=TRUE), or the matching elements themselves (value=TRUE).
2425
26
27282930
31
32
test1 <- c("abc", "Abc", "ABC", "ABCD")grep("abc", test1) # with the default arguments[1] 1grep("abc", test1, value=TRUE)[1] "abc"# with only letters the behavior is as expected,# special characters will be interpreted as# regular expressions (unless fixed=TRUE)grep("abc", test1, fixed=TRUE) # search exactly (fast)[1] 1grep("abc", test1, ignore.case=TRUE)[1] 1 2 3 4grep("abc", test1, invert=TRUE)[1] 2 3 4
A regular expression is a grammar specifying the syntax for flexible pattern matching. Only the basics of regular expressions are described here.
33343536-3738394041424344
4546
4748
4950
5152
-535455
56
57
5859
-6061626364
65
66
67
-68697071
727374
7576
-7778798081
828384
858687
?regex# these can get very advanced, for example:# "/^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/"# can be used to match any email address # Repetition qualifiers:# "." matches anything once# "+" suffix means one or more times# "*" suffix means zero or more times# "?" suffix means zero or one timetest2 <- c("m+n", "man", "moon", "mn", "mooon", "mon")# zero or more 'o' characters between m and ngrep("mo*n", test2, value=TRUE)[1] "moon" "mn" "mooon" "mon" # one or more 'o' characters between m and ngrep("mo+n", test2, value=TRUE)[1] "moon" "mooon" "mon" # anything between m and n oncegrep("m.n", test2, value=TRUE)[1] "m+n" "man" "mon"# zero or more characters between m and ngrep("m.*n", test2, value=TRUE)[1] "m+n" "man" "moon" "mn" "mooon" "mon" # zero or one 'o' characters between m and ngrep("mo?n", test2, value=TRUE)[1] "mn" "mon" # special cases:# search for a special character (+) as isgrep("m+n", test2, value=TRUE)[1] "mn"grep("m\\+n", test2, value=TRUE)[1] "m+n"grep("m+n", test2, fixed=TRUE, value=TRUE)[1] "m+n"# 2, 3 or 4 'o' characters between m and ngrep("mo{2,4}n", test2, value=TRUE)[1] "moon" "mooon" # Anchoring: beginning and ending# "^" means must start at the beginning# "$" means must finish at the endtest3 <- c("abc", "abc d", "abcd", "fabc")grep("abc", test3, value=TRUE)[1] "abc" "abc d" "abcd" "fabc" grep("^abc", test3, value=TRUE)[1] "abc" "abc d" "abcd" grep("abc$", test3, value=TRUE)[1] "abc" "fabc"grep("^abc$", test3, value=TRUE)[1] "abc" # Alternation constructs - either/ortest4 <- c("men", "man", "women", "moon", "maan", "mn")# the pipe "|" means ORgrep("e|a", test4, value=TRUE)[1] "men" "man" "women" "maan" # combining multiple special characters together# use parentheses to group part of the expressiongrep("m(e|a)+n", test4, value=TRUE)[1] "men" "man" "women" "maan" # parentheses can also group multiple charactersgrep("(en|an)", test4, value=TRUE)[1] "men" "man" "women" "maan" # Character classes - matching in a grouptest5 <- c("moon", 
"maan", "mn", "myn", "m_n", "mDn")# square brackets "[]" define a "character class":# only vowels located between m and ngrep("m[aeiou]*n", test5, value=TRUE)[1] "moon" "maan" "mn" # the "^" in a character class means NOT# anything but a vowel located between m and ngrep("m[^aeiou]*n", test5, value=TRUE)[1] "mn" "myn" "m_n" "mDn"# the "-" in a character class defines a range# only lowercase consonants located between m and ngrep("m[b-df-hj-np-tv-z]*n", test5, value=TRUE)[1] "mn" "myn"
There are also built-in pattern classes that will match multiple digits, words, or other pre-defined patterns.
888990919293


94959697


9899100101





102103104105



106107108109110


-111112113114115116


117118119120121

# Built-in classes: \d, \w
sentence <- "pi is approximately equal to 3.14. Also 22/7."
# match numbers without escaping "."
nlist_matches1 <- gregexpr("\\d+.\\d+", sentence)
nlist1 <- regmatches(sentence, nlist_matches1)
nlist1[[1]]
[1] "3.14" "22/7"
# match numbers while escaping "."nlist_matches2 <- gregexpr("\\d+\\.\\d+", sentence)nlist2 <- regmatches(sentence, nlist_matches2)nlist2[[1]][1] "3.14"
# list of all wordswlist_matches1 <- gregexpr("\\w+", sentence)wlist1 <- regmatches(sentence, wlist_matches1)wlist1[[1]] [1] "pi" "is" "approximately" [4] "equal" "to" "3" [7] "14" "Also" "22" [10] "7"
# words with only letterswlist_matches2 <- gregexpr("[[:alpha:]]+", sentence)wlist2 <- regmatches(sentence, wlist_matches2)wlist2[[1]][1] "pi" "is" "approximately"[4] "equal" "to" "Also"
# words starting with an Upper case letterwlist_matches3 <- gregexpr("[[:upper:]][[:alpha:]]+",   sentence)wlist3 <- regmatches(sentence, wlist_matches3)wlist3[[1]][1] "Also"
# Word boundaries# words starting with an "a"wlist_matches4 <- gregexpr("\\<(a|A)[[:alpha:]]+",   sentence)wlist4 <- regmatches(sentence, wlist_matches4)wlist4[[1]][1] "approximately" "Also"
# words ending in a vowelwlist_matches5 <- gregexpr("[[:alpha:]]+[aeiou]\\>",   sentence)wlist5 <- regmatches(sentence, wlist_matches5)wlist5[[1]][1] "pi" "to" "Also"


< Previous LessonNext Lesson >