-- Declare the flight dataset relation: a comma-delimited file read with
-- PigStorage and given an explicit 13-field schema.
str_dataset = LOAD '/user/root/projects/structuredFlightDataset/part-m-00000'
    USING PigStorage(',')
    AS (
        -- record id, flight date and carrier code
        uid:int,
        FL_DATE:chararray,
        OP_UNIQUE_CARRIER:chararray,
        -- origin airport / city fields
        ORIGIN_AIRPORT_ID:int,
        ORIGIN_AIRPORT_SEQ_ID:int,
        ORIGIN_CITY_MARKET_ID:int,
        ORIGIN_CITY_NAME:chararray,
        -- destination airport / city fields
        DEST_AIRPORT_ID:int,
        DEST_AIRPORT_SEQ_ID:int,
        DEST_CITY_MARKET_ID:int,
        DEST_CITY_NAME:chararray,
        -- departure and arrival times are kept as text (chararray)
        DEP_TIME:chararray,
        ARR_TIME:chararray
    );
-- Keep only the rows whose origin city is one of the five cities of interest.
-- The literals previously had a doubled opening quote (''Atlanta'), which Pig
-- tokenizes as an empty string followed by a bare identifier and rejects as a
-- syntax error. Each city is now a single, properly quoted chararray constant.
-- NOTE(review): this assumes ORIGIN_CITY_NAME holds the bare city name. If the
-- stored values actually begin with a quote character, escape it inside the
-- literal instead (e.g. '\'Atlanta') -- confirm against the data.
str_dataset_filtered = FILTER str_dataset BY ORIGIN_CITY_NAME IN ('Atlanta', 'Nashville', 'Baltimore', 'Dallas', 'Houston');
- Load & filter the data as before
-- Project the filtered relation down to three fields:
-- the record id, the carrier code and the origin city name.
strfil_dataset_fewcols = FOREACH str_dataset_filtered
    GENERATE
        uid,
        OP_UNIQUE_CARRIER,
        ORIGIN_CITY_NAME;
- "FOREACH" and "GENERATE" are used together to select only the specific columns of interest
- "GROUP ... BY" groups the records by a specific field value
- "STORE" writes the resulting data back into HDFS