May 18, 2019

R Data Frames

Data Frames in R

Data Frames can have different datatypes along different columns, while in a matrix all columns have the same datatype.
Can hold numeric, character or logical values. Within a column all elements have same data type, but different columns can be of different data type.

data.frame(…, row.names = NULL, check.rows = FALSE, check.names = TRUE, fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
data.frame(x=1:3, y=4:6)+2
x <- data.frame("SN" = 1:2, "Age" = c(21,15), "Name" = c("Peppa", "Dora"))
team1 <- data.frame(Player_name,Total_runs, Strike_rate)
score.df = data.frame(text=sentences, score=scores)
team_data_frame <- data.frame(Player_name, Total_runs, Strike_rate, stringsAsFactors = FALSE)
data.frame(x=1:3, y=c("A","B","C"))+2
data.frame(1, 1:10, sample(L3, 10, replace = TRUE))
R_data_frame = data.frame(int = rgeom(24, prob = 0.5), num = runif(24))
subset1 <- data.frame(bank$age, bank$salary, bank$y)
my.data.set <- as.data.frame(matrix(rnorm(10*200), nrow=10))
dd <- transform(data.frame(x, y, z), z = factor(z, labels = LETTERS[9:1]))
df = transform(df, FOO = colsplit(FOO, split = "\\|", names = c('a', 'b')))
my.data.frame <- data.frame(x=rnorm(10), y=rnorm(10)+5, z=rchisq(10,1))
merge(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x",".y"), incomparables = NULL, ...)
data3 <- merge(data1, data2)
df_merge <- merge(market_fact, cust_dimen, by="Cust_id")
df3 <- merge(df1, df2, by.x="name", by.y="names")
df3 <- merge(df1, df2, by.x="M", by.y="ID", all.x=FALSE); df3
merge(x, y, by = c("k1","k2"))

is.data.frame(d)
View(df) # View dataframe
nrow(my.dataset)                        # Returns number of rows
ncol(data.set.2)                        # Returns number of columns
summary(team1_info)                     # Summary analysis of data
summary(attenu, digits = 4)
names(df) # Names of parameters/columns
typeof(df)
typeof(airquality$Temp)
class(df)
length(airquality)

head(x, n = 6L, …) # First 6 observations - top part of dataframe
head(iris)
head(trees, n=3)
head(letters, n = -5L)
head(df, 33)
head(stats::ftable(Titanic))
tail(x, n = 6L, addrownums = TRUE, …) # Last 6 observations - bottom part of dataframe
tail(iris)
tail(letters, n = -8L)
tail(df, 15)
data(airquality)
edit(InsectSprays)
edit(InsectSprays, factor.mode = "numeric")
gold_silver <- edit(gold_silver)
edit.data.frame(name, factor.mode=c("numeric", "character"))
fix(gold_silver) # equivalent to x <- edit(x)
df$Assay <- replace(df$Assay, df$Assay=="CCT6-18", "CCT6-018")
attributes(mydata)
attr(object, name)
attr(mydata, "row.names") <- c("pointone","pointtwo","pointthree","pointfour","pointfive")

str(iris)      # structure
str(bank$marital)
mt_str <- capture.output(str(mtcars))
unique(hflights$CancellationCode)
dni3 <- dimnames(iris3)
dim(df1)
rownames(df2)
rownames(df3) <- c(...)
colnames(df)
colnames(internet)[which(colnames(internet)=="X")] <- "internet_ser"
data_frame <- setNames(data_frame, c("premium","change","newprice"))

indexing with [ will return a data frame
Accessing with [[ or $ is similar will return a vector

team[3, 2]                    # Accessing one element
team1["4","Player_name"]  
teams[2, ]                    # Accessing second row
data_frame_r[ ,3]                    # Accessing third column
team2[ ,"Runs"]            # Accessing column using column name
team1$Total_runs        # Accessing column using column name
team9$Total_runs[999]
df.data.frame[2:3, 1:2]        # Accessing selective rows & columns
team2[1:3, "Total_runs"]
data.frame3[c(1,3), ] # 1st, 3rd rows
data.frame1[c(2,4), c("col44", "col33")]
my_df[, -4]      # Omit fourth column of my_df
my_df[-(1:5), ] # Omit first 5 rows of my_df
sales2[, 5:(ncol(sales2) - 15)]
ds[["Name"]]
ds[[3]]
trees[trees$Height > 82,]    # selects rows with Height greater than 82
plantdata_Q <- plantdata[plantdata$Type == "Quebec", ]
trees[10:12, 2, drop = FALSE]
airquality[airquality$Temp>60, c("Ozone","Temp")]
bollywood[order(bollywood$Total_collection, decreasing=TRUE),]

subset(x, subset, select, drop = FALSE, …)
subset(planets_df, subset = rings)
subset(airquality, Temp > 80, select = c(Ozone, Temp))
subset(aq, Day==1, select=-Temp)
subset(state.x77, grepl("^M", nm), Illiteracy:Murder)
subset(mtcars, select = -c(hp, drat, wt, qsec) )
subset(products_sold, !is.na(inventory), select=-product)
data.frame.R <- subset(airquality, !is.na(Ozone))
setosa.data <- subset(iris, Species=="setosa" & Sepal.Length < median(Sepal.Length), select=-Species)
slice <- subset(merge4, merge4$Ship_Mode=="EXPRESS AIR")
dice <- subset(merge3, merge3$Region=="ATLANTIC" & merge3$Order_Priority="MEDIUM")
data.set2 <- subset(data, Publisher %in% c("ARI", "ESA"))

sum(is.na(airquality))
length(is.na(airquality))
sort(df$col9)
order(r.data.frame$col.name)
max(airquality[,4])
which.max(airquality[,4])
which.min(airquality[,2])
which(is.na(airquality$Solar.R)== TRUE)
mean(airquality$Solar.R, na.rm = TRUE)
median(df[[i]])
quantile(df)
seq_along(df)
colMeans(df)
log(df)
duplicated(customer$customerID)
sum(duplicated(rounds2$company_permalink)) # number of duplicates
na.omit (df)
na.exclude(data.set9)
na.fail(g)
na.pass(g)
x <- df[complete.cases(df), ]
df[complete.cases(df[, 5:6]),]
df %>% drop_na () # library (tidyr)
dplyr::filter(df, !is.na(columnname))
df [ !(is.na(df$column1)) | !(is.na(df$column2))]
str_count(grades$submission, ".zip")

Complete_team <- rbind(team1, team2)      # Combining/row bind two data frames
team_info <<- cbind(team1, team3)              # Combining/column bind data frames
tea_info <<- cbind(tea, Player_age, hit_six) # Column bind two vectors directly
sum(Complete_team$Total_runs)

df$VAR3[1] <- 7.996
x[1,"Age"] <- 20
x$State <- c("NY", "FL")
rbind(x, list(1,16, "Paul"))
cbind(x, State=c("NY", "FL"))
x$State <- NULL # removing column from data frame
x <- x[-1,]
pp_data_frame[["height"]] <- height_vector
my.data2 <- my.data*10e8
fdata$name <- as.character(fdata$name)
fdata[which(is.na(fdata$name)), 1] <- "Satya"
anscombe <- within(anscombe, {y1[1:3] <- NA})
data$Collapsed.titles[data$Collapsed.titles %in% c("GEB", "GCB", "Ecol", "JE")] <- "Next4"

scale(x, center = TRUE, scale = TRUE)
(centered.x <- scale(x, scale = FALSE))
cov(centered.scaled.x <- scale(x))

sqldf package in R
library(sqldf)
sqldf(x, stringsAsFactors = FALSE, row.names = FALSE, envir = parent.frame(), method = getOption("sqldf.method"), file.format = list(), dbname, drv = getOption("sqldf.driver"), user, password = "", host = "localhost", port, dll = getOption("sqldf.dll"), connection = getOption("sqldf.connection"), verbose = isTRUE(getOption("sqldf.verbose")))
sqldf("SELECT price, volume FROM df")
a1s <- sqldf("select * from warpbreaks limit 6")
SELECT *, price * volume AS totalValue FROM df ORDER BY totalValue DESC LIMIT 30
SELECT *, price * volume AS totalValue FROM df WHERE price >= (SELECT AVG(price) FROM df) ORDER BY totalValue DESC LIMIT 100
read.csv.sql(file, sql = "select * from file", header = TRUE, sep = ",", row.names, eol, skip, filter, nrows, field.types, colClasses, dbname = tempfile(), drv = "SQLite", ...)
read.csv2.sql(file, sql = "select * from file", header = TRUE, sep = ";", row.names, eol, skip, filter, nrows, field.types, colClasses, dbname = tempfile(), drv = "SQLite", ...)
as <- sqldf("select * from warpbreaks where breaks between 20 and 30", row.names = TRUE)
ad <- sqldf('select Species, avg("Sepal.Length") `Sepal.Length`, avg("Sepal.Width") `Sepal.Width` from iris group by Species')


Related Articles: R Matrices   Vectors in R

No comments:

Post a Comment