October 4, 2019

R String Functions and the stringr Package

String operations in base R and the stringr/wordcloud packages

str(object, max.level = NA, vec.len = strO$vec.len, digits.d = strO$digits.d, nchar.max = 128, give.attr = TRUE, drop.deparse.attr = strO$drop.deparse.attr, give.head = TRUE, give.length = give.head, width = getOption("width"), nest.lev = 0, indent.str = paste(rep.int(" ", max(0, nest.lev + 1)), collapse = ".."), comp.str = "$ ", no.list = FALSE, envir = baseenv(), strict.width = strO$strict.width, formatNum = strO$formatNum, list.len = 99, ...) # the strO$ defaults come from strOptions()
str(1:12)
str(.Machine, digits.d = 20)
str(lsfit(1:9, 1:9), width = 60, strict.width = "wrap")
str(longch, nchar.max = 52)
str(quote( { A+B; list(C, D) } ))
str(hist(islands, breaks = 12, plot =  FALSE))
strsplit(x, "e")
strsplit("", " ")[[1]]
strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)
strsplit(paste(c("", "a", ""), collapse="#"), split="#")[[1]]
strsplit(x, ". ", fixed=TRUE)
ch <- strtrim(paste(LETTERS, collapse="._"), 64)
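strReverse() on the next line is not a base function; it is the helper defined in the ?strsplit examples (a is a hypothetical input vector):
strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse = "")
a <- c("abc", "Statistics")  # strReverse(a) returns "cba" "scitsitatS"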
strReverse(a)

noquote(my_string)
noquote(as.character(1:4))
noquote(strsplit("Text I want to display with spaces", NULL)[[1]])
toString(12.06)
toString(1:8)
toString(c("njour", 123, TRUE, NA, log(exp(1))))
toString(c("one", "two", "3333333333"), width = 12)

y = tolower(str)
toupper("ACdf")
capitalize("abcdef")
nchar(x, type = "chars", allowNA = FALSE, keepNA = NA)
nchar(variable)
nchar(char_vect, keepNA=FALSE)
nchar("Reenné", type = "bytes")
nzchar(x, keepNA = FALSE)

substr(x, start, stop)
substr(x, start, stop) <- value
substr("abcdef", 2, 4)
substr(rep("abcdef", 4), 1:4, 4:5)
substr(z, 2, 3) <- c("#", "@")
substring(text, first, last = 1000000L)
substring(text, first, last = 1000000L) <- value
substring("abcdef", 1:6, 1:6)
substring(x, 2, 4:6)
substring(x, 2) <- c("..", "+++")
substring(text, x, x + attr(x, "match.length") - 1)
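A minimal sketch of the regexpr()/substring() idiom on the line above (txt and the pattern are hypothetical):
txt <- "phone: 555-1234"
x <- regexpr("[0-9]{3}-[0-9]{4}", txt)
substring(txt, x, x + attr(x, "match.length") - 1)  # "555-1234"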

sentence = gsub("[[:punct:]]", "", sentence)
sentence = gsub('\\d+', '', sentence)
matches = match(words, pos.words)
textcnt(string, n = 1L, method = "string") # textcnt() comes from the tau package
casefold("aLL ChaRacterS in LoweR caSe")
colors11 = abbreviate(some_colors)
colors33 = abbreviate(some_colors, minlength = 3, method = "both.sides")
chartr("_", "-", data1$c) # character translation
chartr("a", "A", "This is boring string")
chartr("aei", "#!?", crazy)

grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE) # global regular expression print
grep("[a-z]", letters)
grep(pattern = "[0-9]", numerics, value = TRUE)
grep(pattern = "w", x = states, value = TRUE)
grep(pattern = "w", x = tolower(states), value = TRUE, ignore.case = TRUE)
? The preceding item is optional and will be matched at most once
* The preceding item will be matched zero or more times
+ The preceding item will be matched one or more times
{n} The preceding item is matched exactly n times
{n,} The preceding item is matched n or more times
{n,m} The preceding item is matched at least n times, but not more than m times
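people in the two grep() calls below is undefined in the source; a hypothetical vector makes the quantifiers visible:
people <- c("mat", "matt", "emma", "tom")  # hypothetical data
grep("m{2}", people, value = TRUE)  # two consecutive m's: "emma"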
grep(pattern = "m{1}", people, value = TRUE, perl = FALSE)
grep(pattern = "m+.t", people, value = TRUE)
grepl(pattern, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) # grep logical
grepl("name", str)
grepl(pattern = email_pat, x = "satya@xyz.com")

sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) # substitute (first match only)
sub("Satyaaa", "satya", str)
sub("\\(", "", "Peace(Love)")
sub(" +$", "", str)
\\b match a word boundary
\\B match a non-(word boundary)
\\d match a digit character
\\D match a non-digit character
\\h match a horizontal space
\\H match a non-horizontal space
\\s match a space character
\\S match a non-space character
\\w match a word character
\\W match a non-word character
\\v match a vertical space
\\V match a non-vertical space
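A quick word-boundary illustration with a hypothetical string:
gsub("\\bcat\\b", "dog", "cat in the catalog")  # "dog in the catalog"; "catalog" is untouched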
sub("\\s+$", "", str, perl = TRUE)
sub("\\W", "_", "the dandelion war 2010")
sub(".*orange[ :]*([0-9]*).*", "\\1", df$fruit, ignore.case=TRUE)

gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) # global substitute
gsub("([ab])", "\\1_\\1_", "abc and ABC")
gsub("\\b(\\w)",    "\\U\\1", txt, perl=TRUE)
gsub("\\D", "_", "the dandelion dead dad")
mapping$category_list <- gsub("0", "na", mapping$category_list)
ma_frame$cat_list <- gsub("\\|.*", "", ma_frame$cat_list)
[[:lower:]] Lower-case letters
[[:upper:]] Upper-case letters
[[:alpha:]] Alphabetic characters ([[:lower:]] and [[:upper:]])
[[:digit:]] Digits: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
[[:alnum:]] Alphanumeric characters ([[:alpha:]] and [[:digit:]])
[[:blank:]] Blank characters: space and tab
[[:cntrl:]] Control characters
[[:punct:]] Punctuation characters: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
[[:space:]] Space characters: tab, newline, vertical tab, form feed, carriage return, and space
[[:xdigit:]] Hexadecimal digits: 0-9 A B C D E F a b c d e f
[[:print:]] Printable characters ([[:alnum:]], [[:punct:]] and space)
[[:graph:]] Graphical characters ([[:alnum:]] and [[:punct:]])
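la_vie in the next two calls is undefined in the source; a hypothetical string:
la_vie <- "La vie est belle!!!"  # [[:punct:]] strips only the !'s; [[:graph:]] leaves just the spaces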
gsub(pattern = "[[:punct:]]", replacement = "", la_vie)
gsub(pattern = "[[:graph:]]", replacement = "", la_vie)
grepl(replacing_pattern, df$column)

for (i in 1:length(my.lowercase)) {
  my.new.text <- gsub(my.lowercase[i], my.uppercase[i], my.new.text)
}
regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
regexpr("en", txt)
gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
gregexpr("e", txt)
gregexpr(name.rex, notables, perl = TRUE)[[2]]
positions_a = gregexpr(pattern = "a", text = states, ignore.case = TRUE)
regexec(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
regmatches(x, m, invert = FALSE)
regmatches(x, m, invert = FALSE) <- value
m <- regexpr(pattern, x)
regmatches(x, m, invert = TRUE)
lengths(regmatches(mapping$category_list, gregexpr("0", mapping$category_list))) # count occurrences of "0" per element
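A self-contained regmatches() extraction sketch (x and the pattern are hypothetical):
x <- "apples x4, oranges x7"
m <- gregexpr("[0-9]+", x)
regmatches(x, m)  # all digit runs: "4" "7"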

library("stringr")
str_c("111", "Then", "5", "B", "noooo", "str")
stringr::str_split(string=str1, pattern='-')
word.list = str_split(sentence, "\\s+")
str_split("my-name-is-sat",pattern="-")
str_split(flavors, "[aeiou]", n = 2)
first_last <- str_split_fixed("First Last", pattern = " ", n = 2)
split_prod <- str_split_fixed(sales$Cust.Name," ", 2)
s11 = str_split_fixed(ma_frame$cat_list, "[|]", 2)
stringr::str_sub(string=str1, start=1, end=3)
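hw in the following calls is the example string used throughout the stringr documentation:
hw <- "Hadley Wickham"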
str_sub(hw, 1, 6)
str_sub(hw, c(1, 8), c(6, 14))
str_sub(rem, start = 1, end = 5)
str_sub(hw, end = -7)
str_sub(hw, pos[, 1], pos[, 2])
str_sub(hw, end = seq_len(str_length(hw)))
str_sub(x, -2, -2) <- "GHIJ"; x
str_sub(resto, start = -4, end = -1)
str_sub(rem, -seq_len(nchar(rem)))

strFound <- stringr::str_detect(string = df$col1, pattern = regex('Sas', ignore_case = TRUE)) # the old ignore.case() modifier is defunct in current stringr
date_cols <- str_detect(names(sales5), "dt")
str_detect("vowel",pattern = "[aeiou]")
mapping_file$zeros <- str_detect(mapping_file$category_list, "0")
df[str_detect(df, "penny")]
str_extract("abcd00", pattern = "[A-z]{1,5}")
str_extract(shopping_list, "[a-z]{1,4}")
str_extract(bm$job, pattern = "[a-z]{1,4}")
str_extract(first_last, pattern = "[A-Z]{1}")
str_extract_all("91-999-888-7777",pattern = "[0-9]{1,10}")
str_extract_all(shopping_list, "\\b[a-z]+\\b")
str_extract_all("This is, my own, a biggg sentence.", boundary("word"))
str_match(strings, dates)
str_match_all(paris_tweets, "#[a-zA-Z]{1,}")
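A self-contained str_match() sketch showing how capture groups come back as matrix columns (the date pattern is hypothetical):
str_match("2019-10-04", "(\\d{4})-(\\d{2})-(\\d{2})")  # full match, then one column per group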
str_join("111", "Then", "5", "B", "noooo", "str", sep = "-")
str_count(bm$job, "teach")
str_count("Keep calm, quiet and code", pattern = "[aeiou]")
mapping_file$zeros <- str_count(mapping_file$category_list, "0")
str_length(some_text)

str_to_lower(sales$Prod.Name, locale = "en")
str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)
str_replace(fruits, "[aeiou]", "-")
str_replace(fruits, "[aeiou]", c("1", "2", "3"))
mapping_file$category_list <- str_replace(mapping_file$category_list, "[0]", "na")
str_replace_all(fruits, "b", NA_character_)
str_replace_all(fruits, "([aeiou])", "\\1\\1")
mapping$category_list <- str_replace_all(mapping$category_list, "0", "na")
churn$MonthlyCharges <- str_replace_all(churn$MonthlyCharges, "[$]", "")
titles10 = str_replace_all(titles10, pattern = "[[:punct:]]", "")
str_locate(paris_tweets, "#[a-zA-Z]{1,}")
pos <- str_locate_all(hw, "[aeio]")[[1]]
str_dup("cola", 3)
str_dup(words, 1:5)
str_pad("cola", width = 7)
str_pad("hashtag", width = 9, side = "both", pad = "-")
str_wrap(string, width = 80, indent = 0, exdent = 0)
cat(str_wrap(some_quote, width = 30, exdent = 3), "\n")
str_trim(bad_text, side = "left")
str_trim(bad_text, side = "both")
word(change, 1)
word(change, -1)
word(change, 2, -1)

library(wordcloud)

wordcloud(words,freq,scale=c(4,.5),min.freq=3,max.words=Inf, random.order=TRUE, random.color=FALSE, rot.per=.1, colors="black", ordered.colors=FALSE, use.r.layout=FALSE, fixed.asp=TRUE, ...)
wordcloud(c(letters, LETTERS, 0:9), seq(1, 1000, len = 62))
wordcloud(words, freq)
wordcloud(txt, seq(50, 400, len = 9))
wordcloud(unique_words, count_words, scale=c(8,.2), min.freq=6, max.words=Inf, random.order=FALSE, rot.per=.15)
cleaned_books %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
wordcloud(txt, seq(20, 100, len = length(txt)),c(5,.3))
wordcloud(d$word, d$freq, scale = c(8, .3), min.freq = 2, max.words = 100, random.order = TRUE, rot.per = .15, colors = pal, vfont = c("script", "plain"))
wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))
wordcloud(modi_data, scale = c(5, 0.5), max.words = 1, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
wordcloud(loan_descriptions.corpus, max.words = 100, random.order=FALSE, rot.per=0.30, use.r.layout=FALSE, colors=brewer.pal(8, "Paired"))
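A self-contained minimal call with made-up words and frequencies:
library(wordcloud)
library(RColorBrewer)
words <- c("r", "string", "regex", "stringr", "wordcloud")  # hypothetical data
freq <- c(50, 30, 25, 20, 10)
wordcloud(words, freq, min.freq = 1, colors = brewer.pal(5, "Dark2"))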
wordlayout(x, y, words, cex=1, rotate90 = FALSE, xlim=c(-Inf,Inf), ylim=c(-Inf,Inf), tstep=.1, rstep=.1, ...)
lay <- wordlayout(x,y,w,xlim=c(-3,3),ylim=c(-3,2))
text(lay[,1]+.5*lay[,3],lay[,2]+.5*lay[,4],w)
text(loc[,1],loc[,2],rownames(loc))
textplot(x, y, words, cex=1,new=TRUE, show.lines=TRUE, ...)
textplot(x,y,w)
textplot(loc[,1],loc[,2], rownames(loc),xlim=c(-3.5,3.5)) 
textplot(loc[,1],loc[,2], rownames(loc),cex=USArrests$UrbanPop/max(USArrests$UrbanPop))
data(SOTU)

library(wordcloud2)
names(wcdata) <- c("word", "freq")
wordcloud2(wcdata)

library(tidytext)
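original_books below follows the tidytext documentation, where it is built from the janeaustenr package; a minimal version:
library(dplyr)
library(janeaustenr)
original_books <- austen_books()  # a tibble with columns text and book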
tidy_books <- original_books %>% unnest_tokens(word, text)
cleaned_books <- tidy_books %>%
  anti_join(stop_words)
cleaned_books %>% count(word, sort = TRUE) 

install.packages("tm")  # for text mining
docs <- Corpus(VectorSource(text))
lords <- Corpus(DirSource("temp/"))
inspect(lords)
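toSpace on the next line is not part of tm; the usual helper is a content_transformer(), e.g.:
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))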
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, removeWords, stopwords("english"))
modi_data<-tm_map(modi_data, removeNumbers)
jeopCorpus <- tm_map(jeopCorpus, removePunctuation)
dtm <- TermDocumentMatrix(docs)

library(tokenizers)
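james and mobydick in the examples below are plain character strings (the tokenizers vignette uses a Henry James paragraph and the full text of Moby Dick); any string works, e.g.:
james <- "It was the best he could do under the circumstances."  # hypothetical stand-in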
tokenize_characters(james)[[1]] 
tokenize_character_shingles(james, n = 3, n_min = 3, strip_non_alphanum = FALSE)[[1]][1:20]
tokenize_words(james)
tokenize_words(chunks[5:6])
tokenize_word_stems(james)
tokenize_tweets("Welcome, @user, to the tokenizers package. #rstats #forever")
tokenize_ptb(james) # Penn Treebank tokenizer
tokenize_sentences(james) 
tokenize_paragraphs(james)
chunks <- chunk_text(mobydick, chunk_size = 100, doc_id = "mobydick")

count_words(mobydick)
count_characters(mobydick)
count_sentences(mobydick)

library(stopwords)
tokenize_words("spell checkers are not nec", stopwords = stopwords::stopwords("en"))
tokenize_ngrams(james, n = 5, n_min = 2, stopwords = stopwords::stopwords("en"))
tokenize_skip_ngrams(james, n = 5, n_min = 2, k = 2, stopwords = stopwords::stopwords("en"))

install.packages("SnowballC") # for text stemming

require("NLP")
annotate(s, Maxent_Chunk_Annotator(), a3)
entity_annotator <- Maxent_Entity_Annotator()
annotate(s, entity_annotator, a2)
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
parse_annotator <- Parse_Annotator()


