Split string based on alternating character in R -
i'm trying figure out efficient way go splitting string like
"111110000011110000111000"
into vector
[1] "11111" "00000" "1111" "0000" "111" "000"
where "0" , "1" can alternating characters.
try
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=true)[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000"
update
a modification of @rawr's solution stri_extract_all_regex
library(stringi) stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000" stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" #[10] "000" stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]] #[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" #[8] "00000" "222" "aaa" "bb" "cc" "d" "11" #[15] "d" "aa" "bb"
benchmarks
library(stringi) set.seed(24) x3 <- stri_rand_strings(1, 1e4) akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]] #modified @thelatemail's function make bit more general thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, perl=true))[[1]] rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=true)[[1]] ananda <- function() unlist(read.fwf(textconnection(x3), rle(strsplit(x3, "")[[1]])$lengths, colclasses = "character")) colonel <- function() with(rle(strsplit(x3,'')[[1]]), mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values)) cryo <- function(){ res_vector=rep(na_character_,nchar(x3)) res_vector[1]=substr(x3,1,1) counter=1 old_tmp='' (i in 2:nchar(x3)) { tmp=substr(x3,i,i) if (tmp==old_tmp) { res_vector[counter]=paste0(res_vector[counter],tmp) } else { res_vector[counter+1]=tmp counter=counter+1 } old_tmp=tmp } res_vector[!is.na(res_vector)] } richard <- function(){ cs <- cumsum( rle(stri_split_boundaries(x3, type = "character")[[1l]])$lengths ) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } nicola<-function(x) { indices<-c(0,which(diff(as.integer(chartoraw(x)))!=0),nchar(x)) substring(x,indices[-length(indices)]+1,indices[-1]) } richard2 <- function() { cs <- cumsum(rle(strsplit(x3, null)[[1l]])[[1l]]) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } system.time(akrun()) # user system elapsed # 0.003 0.000 0.003 system.time(thelate()) # user system elapsed # 0.272 0.001 0.274 system.time(rawr()) # user system elapsed # 0.397 0.001 0.398 system.time(ananda()) # user system elapsed # 3.744 0.204 3.949 system.time(colonel()) # user system elapsed # 0.154 0.001 0.154 system.time(cryo()) # user system elapsed # 0.220 0.005 0.226 system.time(richard()) # user system elapsed # 0.007 0.000 0.006 system.time(nicola(x3)) # user system elapsed # 0.190 0.001 0.191
on bigger string,
set.seed(24) x3 <- stri_rand_strings(1, 1e6) system.time(akrun()) #user system elapsed #0.166 0.000 0.155 system.time(richard()) # user system elapsed # 0.606 0.000 0.569 system.time(richard2()) # user system elapsed # 0.518 0.000 0.487 system.time(colonel()) # user system elapsed # 9.631 0.000 9.358 library(microbenchmark) microbenchmark(richard(), richard2(), akrun(), times=20l, unit='relative') #unit: relative # expr min lq mean median uq max neval cld # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
note: tried run other methods, takes long time
data
str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11daabb"
Comments
Post a Comment