Creating New Variables

Loading our data:

# Make sure the local data directory exists before downloading into it.
if (!file.exists("./data")) {
  dir.create("./data")
}

# Download the restaurants CSV and read it into a data frame.
fileURL <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(url = fileURL, destfile = "./data/restaurants.csv", method = "curl")
restData <- read.csv(file = "./data/restaurants.csv")

Creating Sequences

Sometimes you need an index for your dataset

# Values from 1 up to at most 10, stepping by 2.
s1 <- seq(from = 1, to = 10, by = 2)
s1

## [1] 1 3 5 7 9

# Three evenly spaced values from 1 to 10. The argument is spelled out as
# `length.out` instead of relying on partial matching of `length`.
s2 <- seq(1, 10, length.out = 3)
s2

## [1]  1.0  5.5 10.0

# Build an index vector for x. seq_along() is the direct form of
# seq(along.with = x) and avoids partial matching of the `along` argument.
x <- c(1, 3, 8, 25, 100)
seq_along(x)

## [1] 1 2 3 4 5

Subsetting variables

This creates a new column, nearMe, that is TRUE when a restaurant's neighborhood is one of the two listed and FALSE otherwise.

# Add a logical column marking rows whose neighborhood is in the given set,
# then count how many rows fall on each side.
restData$nearMe <- restData$neighborhood %in% c(
  "Roland Park",
  "Homeland"
)
table(restData$nearMe)

## 
## FALSE  TRUE 
##  1314    13

Creating binary variables

This flags invalid zip codes: it creates a column called zipWrong that is TRUE when zipCode is less than 0 and FALSE otherwise.

# Mark negative zip codes: TRUE where zipCode < 0, FALSE otherwise.
restData$zipWrong <- with(restData, ifelse(zipCode < 0, TRUE, FALSE))
# Cross-tabulate the new column against the raw condition as a sanity check.
table(restData$zipWrong, restData$zipCode < 0)

##        
##         FALSE TRUE
##   FALSE  1326    0
##   TRUE      0    1

Creating New Categorical Variables

This splits the zipcodes into 4 groups, creates a new column, and assigns each row to one of the zipcode groups

# Bin the zip codes into quartile groups. cut() builds left-open intervals by
# default, so without include.lowest = TRUE the row holding the minimum zip
# code falls outside every interval and is silently turned into NA.
restData$zipGroups <- cut(
  restData$zipCode,
  breaks = quantile(restData$zipCode),
  include.lowest = TRUE
)
table(restData$zipGroups)

## 
## [-2.123e+04,2.12e+04]  (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] 
##                   338                   375                   282 
## (2.123e+04,2.129e+04] 
##                   332

This is an easier way to do the same thing with the cut2 function in Hmisc

library(Hmisc)

## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

# Same grouping via Hmisc's cut2(), asking for 4 groups.
restData$zipGroups <- cut2(restData$zipCode, g = 4)

# Cross-tabulate each group against the individual zip codes it contains.
table(restData$zipGroups, restData$zipCode)

##                 
##                  -21226 21201 21202 21205 21206 21207 21208 21209 21210
##   [-21226,21205)      1   136   201     0     0     0     0     0     0
##   [ 21205,21220)      0     0     0    27    30     4     1     8    23
##   [ 21220,21227)      0     0     0     0     0     0     0     0     0
##   [ 21227,21287]      0     0     0     0     0     0     0     0     0
##                 
##                  21211 21212 21213 21214 21215 21216 21217 21218 21220
##   [-21226,21205)     0     0     0     0     0     0     0     0     0
##   [ 21205,21220)    41    28    31    17    54    10    32    69     0
##   [ 21220,21227)     0     0     0     0     0     0     0     0     1
##   [ 21227,21287]     0     0     0     0     0     0     0     0     0
##                 
##                  21222 21223 21224 21225 21226 21227 21229 21230 21231
##   [-21226,21205)     0     0     0     0     0     0     0     0     0
##   [ 21205,21220)     0     0     0     0     0     0     0     0     0
##   [ 21220,21227)     7    56   199    19    18     0     0     0     0
##   [ 21227,21287]     0     0     0     0     0     4    13   156   127
##                 
##                  21234 21237 21239 21251 21287
##   [-21226,21205)     0     0     0     0     0
##   [ 21205,21220)     0     0     0     0     0
##   [ 21220,21227)     0     0     0     0     0
##   [ 21227,21287]     7     1     3     2     1

Creating a factor variable

# Turn the zip codes into a factor (each distinct code becomes a level) and
# show the first ten entries.
restData[["zcf"]] <- factor(restData[["zipCode"]])
restData[["zcf"]][1:10]

##  [1] 21206 21231 21224 21211 21223 21218 21205 21211 21205 21231
## 32 Levels: -21226 21201 21202 21205 21206 21207 21208 21209 ... 21287

# Confirm that the new column is stored as a factor.
class(restData[["zcf"]])

## [1] "factor"

Levels of factor variables

# Draw ten random "yes"/"no" answers (with replacement) to use as example data.
yesno <- sample(x = c("yes", "no"), size = 10, replace = TRUE)

By default, factor() orders the levels alphabetically, so the alphabetically first value becomes the first level. This can be changed by passing levels = c(...) with the levels listed in the desired order.

# Build the factor with an explicit level order ("yes" first) instead of the
# default alphabetical order.
yesnofac <- factor(yesno, levels = c("yes", "no"))
# "yes" is already the first level, so this prints the factor unchanged.
relevel(yesnofac, ref = "yes")

##  [1] no  no  yes no  no  no  no  yes yes no 
## Levels: yes no

# Show the numeric codes behind the factor. They follow the level order given
# when the factor was created: "yes" is 1 and "no" is 2.
as.numeric(yesnofac)

##  [1] 2 2 1 2 2 2 2 1 1 2

# relevel() returns a new factor rather than modifying its argument, so the
# result has to be assigned for the new reference level to take effect.
yesnofac <- relevel(yesnofac, ref = "no")
yesnofac

##  [1] no  no  yes no  no  no  no  yes yes no 
## Levels: no yes

as.numeric(yesnofac) # the first level ("no") is coded 1, the next ("yes") 2

##  [1] 1 1 2 1 1 1 1 2 2 1

# Rebuild the factor from the raw strings with "no" as the first level, then
# show the resulting numeric codes.
yesnofac <- factor(yesno, levels = c("no", "yes"))
as.numeric(yesnofac)

##  [1] 1 1 2 1 1 1 1 2 2 1

Cutting produces factor variables

library(Hmisc)
# Group the zip codes into 4 groups with cut2() and count the rows per group.
restData$zipGroups <- cut2(restData$zipCode, g = 4)
table(restData$zipGroups)

## 
## [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287] 
##            338            375            300            314

Using mutate

mutate() creates a new version of a variable and simultaneously adds it to a dataset.

library(Hmisc); library(plyr)

## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize

# mutate() returns a new data frame based on restData in which the zipGroups
# column is computed from zipCode, cut into 4 groups.
restData2 <- mutate(restData, zipGroups = cut2(zipCode, g = 4))
table(restData2$zipGroups)

## 
## [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287] 
##            338            375            300            314

These are some common transform functions:

abs(x)

sqrt(x)

ceiling(x)

floor(x)

round(x, digits = n)

signif(x, digits=n)

cos(x); sin(x)

log(x)

log2(x); log10(x)

exp(x)

More can be found here: