Subsetting Data R



  • 选列(选择变量)

    Selecting (Keeping) Variables

    # Keep columns by name: index the data frame with a character vector.
    myvars <- c("v1", "v2", "v3")
    newdata <- mydata[myvars]
    
    # Same selection, building the names "v1", "v2", "v3" programmatically.
    # paste0() joins its arguments with no separator.
    myvars <- paste0("v", 1:3)
    newdata <- mydata[myvars]
    
    # Keep columns by position: the 1st and the 5th through 10th.
    newdata <- mydata[c(1, 5:10)]
    

    Excluding (DROPPING) Variables

    # Drop columns by name: flag the unwanted names, then keep the rest.
    # `myvars` is a logical vector with one entry per column of mydata.
    myvars <- names(mydata) %in% c("v1", "v2", "v3")
    newdata <- mydata[!myvars]
    
    # Drop the 3rd and 5th columns using negative positions.
    newdata <- mydata[c(-3, -5)]
    
    # Drop the same 3rd and 5th columns with matrix-style indexing:
    # the row slot is left empty, the column slot holds negative positions.
    newdata <- mydata[, -c(3, 5)]
    
    # Delete columns v3 and v5 from mydata itself by assigning NULL to them.
    mydata$v3 <- mydata$v5 <- NULL
    

    选行(选择观察值(Observations))

    # Keep the first 5 rows (observations).
    newdata <- mydata[1:5, ]
    
    # Keep rows by value: gender is "F" and age is above 65.
    # which() converts the logical test into row numbers.
    newdata <- mydata[which(mydata$gender == "F" & mydata$age > 65), ]
    
    # Drop the 3rd and 5th rows using negative row positions.
    newdata <- mydata[c(-3, -5), ]
    
    # Same gender/age filter as above, written without the `mydata$` prefix.
    # with() evaluates the condition inside mydata, so there is no need to
    # attach()/detach() the data frame and the search path is left untouched.
    newdata <- mydata[with(mydata, which(gender == "F" & age > 65)), ]
    

    用subset同时选行和选列

    # subset() filters rows and picks columns in a single call.
    # Rows: age at least 20 or below 10. Columns: only ID and Weight.
    newdata <- subset(
      mydata,
      age >= 20 | age < 10,
      select = c(ID, Weight)
    )
    
    # Same row filter, but a negated `select` drops ID and Weight instead.
    newdata <- subset(
      mydata,
      age >= 20 | age < 10,
      select = -c(ID, Weight)
    )
    
    # Rows: sex is "m" and age above 25. Columns: the range weight:income.
    newdata <- subset(
      mydata,
      sex == "m" & age > 25,
      select = weight:income
    )
    

    Random Samples

    # Draw 50 rows of mydata at random, without replacement.
    # seq_len() yields the row numbers 1..nrow(mydata) and, unlike 1:n,
    # is empty rather than c(1, 0) when the data frame has no rows.
    mysample <- mydata[sample(seq_len(nrow(mydata)), 50, replace = FALSE), ]
    


  • 什么时候可以用 - 反选, 什么时候不能用 - 反选

    例子一:这个例子里面,我们选择了前三行作为训练集,用了后面的行作为测试集

    # `split` holds row numbers, so a minus sign inverts the selection.
    split <- c(1, 2, 3)
    train <- mydata[split, ]   # rows 1 to 3
    test <- mydata[-split, ]   # every row except 1 to 3
    

    例子二:这个例子里面,我们选择了 ID < 50 作为训练集,其它为测试集

    # The comparison yields a logical vector (one TRUE/FALSE per row), not
    # row numbers, so the complement is taken with `!` rather than `-`.
    split <- mydata$ID < 50
    train <- mydata[split, ]
    test <- mydata[!split, ]
    

    下面的例子都是从网上找到的可以replicate的代码
    例子一:

    
    # Load the built-in mtcars data set.
    data(mtcars)
    
    # Fix the RNG seed so the partition below is reproducible.
    set.seed(123)
    
    # Training-set size: 75% of the rows, rounded down.
    smp_size <- floor(nrow(mtcars) * 0.75)
    
    # sample() picks `smp_size` row numbers at random from 1..nrow(mtcars).
    train_ind <- sample(seq_len(nrow(mtcars)), size = smp_size)
    
    # The indices are row numbers, so `-train_ind` selects all other rows.
    train <- mtcars[train_ind, ]
    test <- mtcars[-train_ind, ]
    

    例子二:

    n <- nrow(dataset)
    # Draw one TRUE/FALSE per row (TRUE with probability 0.75). The result
    # is a logical vector as long as the data set, so the training share is
    # about 75%, not exactly 75%.
    split <- sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.75, 0.25))
    # A logical index is inverted with `!`, not with `-`.
    training <- dataset[split, ]
    testing <- dataset[!split, ]
    

登录后回复