Split string based on alternating character in R -


i'm trying figure out efficient way go splitting string like

"111110000011110000111000" 

into vector

[1] "11111" "00000" "1111" "0000" "111" "000" 

where "0" , "1" can alternating characters.

try

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=true)[[1]] #[1] "11111" "00000" "1111"  "0000"  "111"   "000"   

update

a modification of @rawr's solution stri_extract_all_regex

library(stringi) stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "1111"  "0000"  "111"   "000"     stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   #[10] "000"    stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]] #[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   #[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"      #[15] "d"       "aa"      "bb"      

benchmarks

library(stringi)  set.seed(24) x3 <- stri_rand_strings(1, 1e4)  akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]] #modified @thelatemail's function make bit more general thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,              perl=true))[[1]] rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=true)[[1]] ananda <- function() unlist(read.fwf(textconnection(x3),                  rle(strsplit(x3, "")[[1]])$lengths,                  colclasses = "character")) colonel <- function() with(rle(strsplit(x3,'')[[1]]),     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))  cryo <- function(){    res_vector=rep(na_character_,nchar(x3))   res_vector[1]=substr(x3,1,1)   counter=1   old_tmp=''     (i in 2:nchar(x3)) {     tmp=substr(x3,i,i)     if (tmp==old_tmp) {     res_vector[counter]=paste0(res_vector[counter],tmp)     } else {     res_vector[counter+1]=tmp     counter=counter+1     }   old_tmp=tmp    }   res_vector[!is.na(res_vector)]   }    richard <- function(){      cs <- cumsum(      rle(stri_split_boundaries(x3, type = "character")[[1l]])$lengths    )    stri_sub(x3, c(1, head(cs + 1, -1)), cs)   }   nicola<-function(x) {    indices<-c(0,which(diff(as.integer(chartoraw(x)))!=0),nchar(x))    substring(x,indices[-length(indices)]+1,indices[-1])  }   richard2 <- function() {   cs <- cumsum(rle(strsplit(x3, null)[[1l]])[[1l]])   stri_sub(x3, c(1, head(cs + 1, -1)), cs)  }  system.time(akrun()) # user  system elapsed  # 0.003   0.000   0.003   system.time(thelate()) #   user  system elapsed  #  0.272   0.001   0.274   system.time(rawr()) # user  system elapsed  #  0.397   0.001   0.398   system.time(ananda()) #  user  system elapsed  # 3.744   0.204   3.949   system.time(colonel()) #   user  system elapsed  #  0.154   0.001   0.154   system.time(cryo()) #  user  system elapsed  # 0.220   0.005   0.226   system.time(richard()) #  user  system elapsed  # 0.007   0.000   0.006   system.time(nicola(x3)) # user  system elapsed  # 0.190   0.001   0.191  

on bigger string,

set.seed(24) x3 <- stri_rand_strings(1, 1e6)  system.time(akrun()) #user  system elapsed  #0.166   0.000   0.155  system.time(richard()) #  user  system elapsed  # 0.606   0.000   0.569  system.time(richard2()) #  user  system elapsed  # 0.518   0.000   0.487   system.time(colonel()) #  user  system elapsed  # 9.631   0.000   9.358    library(microbenchmark)  microbenchmark(richard(), richard2(), akrun(), times=20l, unit='relative')  #unit: relative  #     expr      min       lq     mean   median       uq      max neval cld  # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b  #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b  # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20   

note: tried run other methods, takes long time

data

str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11daabb" 

Comments

Popular posts from this blog

javascript - AngularJS custom datepicker directive -

javascript - jQuery date picker - Disable dates after the selection from the first date picker -