Skip to content

Instantly share code, notes, and snippets.

We can make this file beautiful and searchable if this error is corrected: Unclosed quoted field in line 10.
"STATION","NAME","DATE","AWND","PRCP","SNOW","SNWD","TAVG","TMAX","TMIN","WT01","WT02","WT03","WT04","WT06","WT08"
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-01","7.61","0.00","0.0","0.0",,"42","34",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-02","6.71","0.00","0.0","0.0",,"40","32",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-03","7.83","0.00","0.0","0.0",,"45","35",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-04","8.50","0.00","0.0","0.0",,"36","14",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-05","5.14","0.00","0.0","0.0",,"29","11",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-06","3.58","0.00","0.0","0.0",,"41","25",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-07","2.46","0.00","0.0","0.0",,"46","31",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-08","4.25","0.00","0.0","0.0",,"46","31",,,,,,
"USW00094728","NY CITY CENTRAL PARK, NY US","2016-01-09","8.05","0.00","0.0","0.0",,"47","40"," 1",,,,,"
<?xml version="1.0" encoding="UTF-8"?>
<!-- File generated by Arc2Earth (http://www.Arc2Earth.com) on 9/15/2006 9:59 AM -->
<kml xmlns="http://earth.google.com/kml/2.2">
<Document>
<name><![CDATA[US States]]></name>
<open>1</open>
<Style id="Style_5">
<IconStyle>
<scale>0.4</scale>
<Icon>
This file has been truncated, but you can view the full file.
{ "DateTime": "2016/01/01 00:30:04.91", "Latitude": 18.0772, "Longitude": -67.1027, "Depth": 19.91, "Magnitude": 2.8, "MagType": "Md", "NbStations": null, "Gap": 125, "Distance": 0, "RMS": 0.44, "Source": "pr", "EventID": 201601012001 }
{ "DateTime": "2016/01/01 00:30:58.02", "Latitude": 37.4485, "Longitude": -115.9455, "Depth": 12.31, "Magnitude": 1.27, "MagType": "ML", "NbStations": 26, "Gap": 150, "Distance": 28, "RMS": 0.2, "Source": "NN", "EventID": 524900 }
{ "DateTime": "2016/01/01 00:37:54.29", "Latitude": 61.531, "Longitude": -152.36, "Depth": 4.94, "Magnitude": 3.43, "MagType": "ML", "NbStations": 29, "Gap": 183, "Distance": 28, "RMS": 0.17, "Source": "AV", "EventID": 61148721 }
{ "DateTime": "2016/01/01 00:37:58.70", "Latitude": 42.5138, "Longitude": 142.7227, "Depth": 89.9, "Magnitude": 4, "MagType": "Mb", "NbStations": null, "Gap": 109, "Distance": 1, "RMS": 0.81, "Source": "us", "EventID": 201601012003 }
{ "DateTime": "2016/01/01 02:00:39.95", "Latitude": -50.5575, "Longitude": 139.4489, "Depth"
-- Gives the generically named source columns readable aliases and keeps only
-- the rows whose G value can be converted to an integer.
-- NOTE(review): the source name contains an unquoted hyphen; confirm the target
-- engine accepts it as written.
SELECT
    A AS host,
    -- Up to 20 characters of D starting at position 2; an empty result becomes NULL.
    NULLIF(substr(D, 2, 20), '') AS request_time,
    F AS request,
    CONVERT_TO_INTEGER(G, 1, 0, 0) AS status,
    CONVERT_TO_INTEGER(H, 1, 0, 0) AS bytes
FROM apache-logs.nasa
WHERE is_convertible_data(G, 1, 'INTEGER')
# Root logger: log only ERROR and more severe events, and route them to the
# appender named "stdout" that is configured below.
log4j.rootLogger=ERROR, stdout
# "stdout" appender: write log events to the console through System.out.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
# Line format: timestamp (yyyy-MM-dd HH:mm:ss), level left-justified and padded
# to 5 characters, last component of the logger name, source line number,
# message, then the platform line separator.
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
// Number of rows in `days` for each (month, day) pair.
// `days` is not defined in this fragment; presumably it holds one row per
// distinct host and day, which would make this a daily unique-host count (verify).
val dailyHosts = days.groupBy("month", "day").count()
// Number of rows in `df` for each (month, day) pair, with both keys derived
// from the "time" column.
val totalReqPerDay = df.select(
month(col("time")).alias("month"),
dayofmonth(col("time")).alias("day")
).groupBy("month", "day").count()
// Join-key column names converted from a Kotlin List to a Scala Seq, the form
// passed to join() in the statement that follows.
// NOTE(review): scala.collection.JavaConversions is deprecated as of Scala 2.12
// and removed in 2.13; confirm the Scala version this runs against.
val grouping = JavaConversions.asScalaBuffer(listOf("month", "day")).toSeq()
totalReqPerDay.join(dailyHosts, grouping)
.select(
// One row per distinct (host, month, day) combination, where month and day
// are both extracted from the "time" column.
val days = df
    .select(
        col("host"),
        month(col("time")).alias("month"),
        dayofmonth(col("time")).alias("day")
    )
    .dropDuplicates()

// Count those distinct rows per (month, day) and print the result with the
// largest counts first.
days.groupBy("month", "day")
    .count()
    .orderBy(desc("count"))
    .show()
// Number of distinct values in the "host" column.
df.select("host").dropDuplicates().count()
// Rows whose status equals 404, counted per resource; prints the 10 resources
// with the highest counts.
df.where(col("status").equalTo(404))
    .groupBy("resource")
    .count()
    .orderBy(desc("count"))
    .show(10)
// Row count per host, restricted to hosts with more than 700 rows and printed
// with the largest counts first.
df.groupBy("host")
    .count()
    .where(col("count").gt(700))
    .orderBy(col("count").desc())
    .show()