廿TT

譬如水怙牛過窓櫺 頭角四蹄都過了 因甚麼尾巴過不得

R: apply系の関数の代わりにpurrrを使ってみる

今日の川柳

apply

行列の行ごとのトリム平均を求める.

applyで書くとこう.

rowmean1 <-apply(x, 1, mean, trim = 0.2)

purrrのmap関数を使うとこう.

rowmean2 <-map(array_branch(x,1),mean,trim=0.2)

ベンチマークしてみるとpurrrのほうがほんのすこしはやいです.

library(purrr)
library(rbenchmark)
set.seed(1)
x <- matrix(rnorm(1000),100,100)
test1 <-benchmark(rowmean1 <-apply(x, 1, mean, trim = 0.2),
          rowmean2 <-map(array_branch(x,1),mean,trim=0.2))
> t(test1)
             1                                          
test         "rowmean1 <- apply(x, 1, mean, trim = 0.2)"
replications "100"                                      
elapsed      "0.351"                                    
relative     "1.048"                                    
user.self    "0.341"                                    
sys.self     "0.010"                                    
user.child   "0"                                        
sys.child    "0"                                        
             2                                                      
test         "rowmean2 <- map(array_branch(x, 1), mean, trim = 0.2)"
replications "100"                                                  
elapsed      "0.335"                                                
relative     "1.000"                                                
user.self    "0.327"                                                
sys.self     "0.006"                                                
user.child   "0"                                                    
sys.child    "0"

結果はいっしょです.

> all(rowmean1==rowmean2)
[1] TRUE

sapply

リストの各要素ごとのトリム平均を求める.

set.seed(2)
xlist <- map(1:100,function(i)rnorm(100))
test2 <- benchmark(emean1 <-sapply(xlist,mean,trim=0.2),
                   emean2 <-map_dbl(xlist,mean,trim=0.2))

ベンチマークしてみるとpurrrのほうがほんのすこしはやいです.

> t(test2)
             1                                          
test         "emean1 <- sapply(xlist, mean, trim = 0.2)"
replications "100"                                      
elapsed      "0.297"                                    
relative     "1.061"                                    
user.self    "0.289"                                    
sys.self     "0.006"                                    
user.child   "0"                                        
sys.child    "0"                                        
             2                                           
test         "emean2 <- map_dbl(xlist, mean, trim = 0.2)"
replications "100"                                       
elapsed      "0.280"                                     
relative     "1.000"                                     
user.self    "0.276"                                     
sys.self     "0.004"                                     
user.child   "0"                                         
sys.child    "0"        

結果はいっしょです.

> all(emean1==emean2)
[1] TRUE

mapply

二項検定.

set.seed(3)
n<-rpois(100,100)
y<-rbinom(100,n,0.5)
test3 <- benchmark(res1 <-mapply(function(y,n){binom.test(y,n)},y,n),
                   res2 <-map2(y,n,function(y,n){binom.test(y,n)}))

purrrのほうがほんのすこしはやいです.

> t(test3)
             1                                                                
test         "res1 <- mapply(function(y, n) {\n    binom.test(y, n)\n}, y, n)"
replications "100"                                                            
elapsed      "1.343"                                                          
relative     "1.091"                                                          
user.self    "1.329"                                                          
sys.self     "0.008"                                                          
user.child   "0"                                                              
sys.child    "0"                                                              
             2                                                              
test         "res2 <- map2(y, n, function(y, n) {\n    binom.test(y, n)\n})"
replications "100"                                                          
elapsed      "1.231"                                                        
relative     "1.000"                                                        
user.self    "1.228"                                                        
sys.self     "0.003"                                                        
user.child   "0"                                                            
sys.child    "0"        

結果はいっしょですが, 帰ってくるときの形式が違います.

all(unlist(res1["p.value",])==map_dbl(res2,function(x)x$p.value))

mapplyのほうはこんな感じ.

> res1[,1]
$statistic
number of successes 
                 47 

$parameter
number of trials 
              90 

$p.value
[1] 0.7520332

$conf.int
[1] 0.4142679 0.6286733
attr(,"conf.level")
[1] 0.95

$estimate
probability of success 
             0.5222222 

$null.value
probability of success 
                   0.5 

$alternative
[1] "two.sided"

$method
[1] "Exact binomial test"

$data.name
[1] "y and n"

map2のほうはこんな感じです.

> res2[[1]]

	Exact binomial test

data:  y and n
number of successes = 47, number of trials = 90, p-value = 0.752
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
 0.4142679 0.6286733
sample estimates:
probability of success 
             0.5222222

tapply

dplyrのgroup_byとsummarizeを使えばいいかな, という気がします.

sweep

行列の各行から行平均を引く.

sweep(x,1,rowmean1)

これはpurrrでどう書けばいいかわからなかった.