Subsetting Data in R
-
选列(选择变量)
Selecting (Keeping) Variables
# Selecting (keeping) variables from `mydata` (assumed to be a data frame
# defined elsewhere in the session).

# Select variables v1, v2, v3 by name.
myvars <- c("v1", "v2", "v3")
newdata <- mydata[myvars]

# Another method: build the same three names programmatically.
# paste0() concatenates without a separator, giving "v1", "v2", "v3".
myvars <- paste0("v", 1:3)
newdata <- mydata[myvars]

# Select the 1st and the 5th through 10th variables by position.
newdata <- mydata[c(1, 5:10)]
Excluding (DROPPING) Variables
# Excluding (dropping) variables from `mydata` (assumed to be a data frame
# defined elsewhere in the session).

# Exclude variables v1, v2, v3: build a logical mask over the column names,
# then keep only the columns where the mask is FALSE.
myvars <- names(mydata) %in% c("v1", "v2", "v3")
newdata <- mydata[!myvars]

# Exclude the 3rd and 5th variables (list-style negative index).
newdata <- mydata[c(-3, -5)]

# Exclude the 3rd and 5th columns (matrix-style negative index).
# The indices now match the comment; the earlier version dropped columns
# 1 and 2 instead.
newdata <- mydata[, -c(3, 5)]

# Delete variables v3 and v5 from mydata itself: assigning NULL to a column
# removes it. The chained assignment clears v5 first, then v3.
mydata$v3 <- mydata$v5 <- NULL
选行(选择观察值(Observations))
# Selecting observations (rows) from `mydata` (assumed to be a data frame
# defined elsewhere in the session).

# First 5 observations.
newdata <- mydata[1:5, ]

# Rows chosen by variable values. which() turns the logical test into row
# numbers, so rows where the test evaluates to NA are left out.
newdata <- mydata[which(mydata$gender == "F" & mydata$age > 65), ]

# Exclude the 3rd and 5th rows.
newdata <- mydata[c(-3, -5), ]

# Same value-based filter without repeating `mydata$`. with() evaluates the
# expression with mydata's columns in scope for this one call only, so no
# attach()/detach() pair is needed and the search path is left untouched.
newdata <- with(mydata, mydata[which(gender == "F" & age > 65), ])
用subset同时选行和选列
# subset() filters rows and picks columns in a single call.

# Rows with age >= 20 or age < 10, keeping only the ID and Weight columns.
newdata <- subset(
  mydata,
  subset = age >= 20 | age < 10,
  select = c(ID, Weight)
)

# Same row filter, but dropping ID and Weight and keeping everything else.
newdata <- subset(
  mydata,
  subset = age >= 20 | age < 10,
  select = -c(ID, Weight)
)

# Part 2: rows where sex is "m" and age is above 25, keeping the run of
# adjacent columns from weight through income.
newdata <- subset(
  mydata,
  subset = sex == "m" & age > 25,
  select = weight:income
)
Random Samples
# Take a random sample of 50 rows from the data set `mydata`, drawn without
# replacement.
# seq_len() replaces 1:nrow(mydata) so that a zero-row data set yields an
# empty index pool instead of c(1, 0).
mysample <- mydata[
  sample(seq_len(nrow(mydata)), size = 50, replace = FALSE),
]
-
什么时候可以用 - 反选, 什么时候不能用 - 反选
例子一:这个例子里面,我们选择了前三行作为训练集,用了后面的行作为测试集
# Positional split: `split` holds row numbers, so the complementary rows are
# obtained with a negative index.
split <- c(1, 2, 3)
train <- mydata[split, ]   # rows 1 to 3
test <- mydata[-split, ]   # every row except 1 to 3
例子二:这个例子里面,我们选择了 ID < 50 作为训练集,其它为测试集
# Logical split: the comparison produces a vector of TRUE/FALSE values, one
# per row, so the complementary rows are obtained with `!` rather than `-`.
split <- mydata$ID < 50
train <- mydata[split, ]    # rows where ID < 50 is TRUE
test <- mydata[!split, ]    # rows where ID < 50 is FALSE
下面的例子都是从网上找到的可以replicate的代码
# Example 1: reproducible 75% / 25% train/test split of mtcars.
data(mtcars)

# Fix the seed so the partition is the same on every run.
set.seed(123)

# Training size is 75% of the rows, rounded down.
n_rows <- nrow(mtcars)
smp_size <- floor(0.75 * n_rows)

# Note: sample() draws smp_size distinct row numbers at random from
# 1..n_rows, so train_ind is a vector of row indices.
train_ind <- sample(seq_len(n_rows), size = smp_size)

# Row indices select the training rows; their negation selects the rest.
train <- mtcars[train_ind, ]
test <- mtcars[-train_ind, ]
例子二:
# Random train/test split of `dataset` (defined elsewhere) using a logical
# mask.
n <- nrow(dataset)

# Note: sample() here produces a TRUE/FALSE vector as long as the data set.
# Each element is drawn independently with probability 0.75 of TRUE, so the
# training share is approximately, not exactly, 75%.
split <- sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.75, 0.25))

# A logical mask is complemented with `!`, not with a negative index.
training <- dataset[split, ]
testing <- dataset[!split, ]