Subsetting is a routine data-management task, and the ff and ffbase packages support subsetting of ffdf objects through the subset.ffdf() method.
# Load the ff and ffbase packages (they must already be installed)
library(ff)
library(ffbase)
# Create the directory where the ff data files will be stored.
# dir.create() is portable and does not depend on a `mkdir` shell command
# being available; showWarnings = FALSE keeps re-runs quiet when the
# directory already exists.
dir.create("ffdf", showWarnings = FALSE)
# Build the full path to that directory
dir_air <- file.path(getwd(), "ffdf")
# Fail early if the directory is missing instead of continuing silently
stopifnot(dir.exists(dir_air))
dir_air
## [1] "C:/Users/Mr.Semicolon/Desktop/R 1/Student/ffdf"
# Set the fftempdir option to the new directory
options(fftempdir = dir_air)
# Import the airline data set from the comma-separated text file,
# reading it in chunks of 100000 rows and printing per-chunk timings
airline.ff <- read.table.ffdf(
  file = "flights_sep_oct15.txt",
  header = TRUE,
  sep = ",",
  colClasses = NA,
  next.rows = 100000,
  VERBOSE = TRUE
)
## read.table.ffdf 1..100000 (100000) csv-read=0.56sec ffdf-write=0.31sec
## read.table.ffdf 100001..200000 (100000) csv-read=0.8sec ffdf-write=0.22sec
## read.table.ffdf 200001..300000 (100000) csv-read=0.56sec ffdf-write=0.22sec
## read.table.ffdf 300001..400000 (100000) csv-read=0.56sec ffdf-write=0.24sec
## read.table.ffdf 400001..500000 (100000) csv-read=0.64sec ffdf-write=0.21sec
## read.table.ffdf 500001..600000 (100000) csv-read=0.69sec ffdf-write=0.25sec
## read.table.ffdf 600001..700000 (100000) csv-read=0.63sec ffdf-write=0.2sec
## read.table.ffdf 700001..800000 (100000) csv-read=0.56sec ffdf-write=0.25sec
## read.table.ffdf 800001..900000 (100000) csv-read=0.53sec ffdf-write=0.27sec
## read.table.ffdf 900001..951111 (51111) csv-read=0.3sec ffdf-write=0.18sec
## csv-read=5.83sec ffdf-write=2.35sec TOTAL=8.18sec
# Check the dimensions of the imported data: number of rows, then columns
dim(airline.ff)
## [1] 951111 28
# Keep only the flights that were not cancelled (CANCELLED == 0) and retain
# four columns: flight date, airline ID, origin city and destination city.
# The subset() generic dispatches to ffbase's subset.ffdf() method for ffdf
# objects; S3 methods are not meant to be called directly.
airline_subset.ff <- subset(
  airline.ff,
  CANCELLED == 0,
  select = c(FL_DATE, AIRLINE_ID, ORIGIN_CITY_NAME, DEST_CITY_NAME)
)
# Dimensions of the subset: number of rows, then columns
dim(airline_subset.ff)
## [1] 946582 4