Skip to content

Instantly share code, notes, and snippets.

@leovegas
Created October 14, 2020 17:16
Show Gist options
  • Save leovegas/8aff444c88792139093cd296e05ed47a to your computer and use it in GitHub Desktop.
Save leovegas/8aff444c88792139093cd296e05ed47a to your computer and use it in GitHub Desktop.
weather-web-parser
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the weather-parser crawler.
  Produces a jar whose manifest names com.company.Main as the entry point and
  lists the runtime dependencies on the Class-Path under the libs/ prefix.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.main</groupId>
    <artifactId>weather-parser</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Java language level read by maven-compiler-plugin. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <build>
        <plugins>
            <!--
              The compiler plugin is a build tool, not a library the application uses,
              so it is declared here rather than under <dependencies>. As a dependency
              it would be added to the application classpath and, because of
              addClasspath below, to the manifest Class-Path as well.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Write a Class-Path entry pointing at libs/<dependency>.jar. -->
                            <addClasspath>true</addClasspath>
                            <classpathPrefix>libs/</classpathPrefix>
                            <mainClass>com.company.Main</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.8</version>
        </dependency>
    </dependencies>
</project>
package com.company;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Downloads a forecast fragment from weather.us, extracts the text that follows every
 * "5,000ft temperature" label, pairs each value with a generated timestamp and appends
 * the resulting lines to a CSV file.
 */
public class FunnyCrawler {
    /** Length of one hour in milliseconds; the step between generated timestamps. */
    private static final long HOUR_MILLIS = 3600000L;

    private static final String DOMAIN_NAME_PATTERN
            = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}";

    // Not referenced by any method of this class.
    private static final Pattern patternDomainName = Pattern.compile(DOMAIN_NAME_PATTERN);

    // Not referenced by any method of this class; kept because it is package-visible.
    Connection.Response res1 = null;

    /** Path of the CSV file that {@link #generateCSV(List)} appends to. */
    private final String outFileName;

    /**
     * @param outFileName path of the CSV file to append results to
     */
    public FunnyCrawler(String outFileName) {
        this.outFileName = outFileName;
    }

    /**
     * Builds one formatted timestamp per entry of {@code hours}, starting at the current
     * system time.
     *
     * <p>The first eleven steps always advance by one hour. After that, a step advances by
     * three hours when the current and previous hour labels form one of the pairs
     * 7&rarr;10, 10&rarr;1, 1&rarr;4 or 4&rarr;7, and by one hour otherwise.
     *
     * <p>NOTE(review): the step derived from labels {@code i-1} and {@code i} is applied
     * to the timestamp of entry {@code i+1}; this mirrors the original behaviour, confirm
     * that the one-entry lag is intended.
     *
     * @param hours hour cells whose text starts with the hour number
     * @return timestamps formatted as {@code yyyy-MM-dd-hh aa}, one per entry of {@code hours}
     * @throws NumberFormatException if a label inspected for a three-hour step does not
     *                               start with a number
     */
    private ArrayList<String> getDateList(List<Element> hours) {
        ArrayList<String> result = new ArrayList<String>(hours.size());
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd-hh aa");
        long timeMillis = System.currentTimeMillis();
        for (int i = 0; i < hours.size(); i++) {
            result.add(formatter.format(new Date(timeMillis)));
            boolean threeHourStep = i > 10 && isThreeHourStep(hours.get(i - 1), hours.get(i));
            timeMillis += threeHourStep ? 3 * HOUR_MILLIS : HOUR_MILLIS;
        }
        return result;
    }

    /**
     * Tells whether two consecutive hour cells form one of the recognised three-hour
     * transitions (7&rarr;10, 10&rarr;1, 1&rarr;4, 4&rarr;7).
     */
    private static boolean isThreeHourStep(Element previous, Element current) {
        int expectedPrevious;
        switch (leadingHour(current)) {
            case 10:
                expectedPrevious = 7;
                break;
            case 1:
                expectedPrevious = 10;
                break;
            case 4:
                expectedPrevious = 1;
                break;
            case 7:
                expectedPrevious = 4;
                break;
            default:
                // The previous label is only parsed when the current one can start a pair.
                return false;
        }
        return leadingHour(previous) == expectedPrevious;
    }

    /**
     * Parses the number held in the first two characters of a cell's text. Shorter text
     * is used whole and surrounding whitespace is ignored, so a label such as
     * {@code "7 AM"} parses as 7 instead of failing.
     */
    private static int leadingHour(Element cell) {
        String text = cell.text();
        String prefix = text.length() > 2 ? text.substring(0, 2) : text;
        return Integer.parseInt(prefix.trim());
    }

    /**
     * Appends every entry of {@code values} as its own single-column record to the output
     * file, using the MySQL CSV format. The file is opened in append mode and is created
     * even when {@code values} is empty. I/O failures are printed to standard error and
     * otherwise ignored.
     *
     * @param values lines to append
     */
    private void generateCSV(List<String> values) {
        // try-with-resources closes (and thereby flushes) the printer and the file handle,
        // which the previous implementation left open.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outFileName), true));
             CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.MYSQL
                     .withAllowMissingColumnNames())) {
            csvPrinter.printRecords(values);
            csvPrinter.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the hour cells from every "next days" panel of the document, dropping the
     * header cells whose text is exactly {@code "Time"}.
     *
     * <p>NOTE(review): the multi-word class names are handed to
     * {@code getElementsByClass} unchanged from the original code; confirm they select
     * the intended elements.
     */
    private static List<Element> collectHourCells(Document doc) {
        List<Element> cells = new ArrayList<>();
        for (Element panel : doc.getElementsByClass("panel-body nextdays")) {
            cells.addAll(panel.getElementsByClass("col-xs-4 col-sm-3 panel-div"));
        }
        return cells.stream()
                .filter(cell -> !cell.text().equals("Time"))
                .collect(Collectors.toList());
    }

    /**
     * Fetches the forecast fragment, extracts the value following each
     * "5,000ft temperature" label, suffixes it with its generated timestamp and appends
     * all lines to the CSV file.
     *
     * <p>A failed download is printed to standard error and treated as an empty document.
     * Labels without a following sibling element are skipped, and labels beyond the number
     * of available hour cells are dropped with a warning.
     *
     * @return the extracted {@code "<value> <timestamp>"} lines, or {@code null} when
     *         nothing was extracted
     */
    public ArrayList<String> getData() {
        ArrayList<String> results = new ArrayList<String>();
        String request = "https://weather.us/forecast/2643743-london/xltrend/euro";
        String request2 = "https://weather.us/ajax_pub/fcxl?city_id=2643743&lang=en&units=us&tf=0&mos_station_id=&model=euro&func=xltrend";
        System.out.println("Sending request..." + request);

        // The data is served by the AJAX endpoint (request2), so that URL is fetched
        // rather than the page URL printed above. A desktop browser user agent is sent.
        Document doc = null;
        try {
            doc = Jsoup
                    .connect(request2)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                    .timeout(15000)
                    .ignoreHttpErrors(true)
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
        }

        if (doc != null) {
            Elements labels = doc.getElementsContainingOwnText("5,000ft temperature");
            // Built once so that every line of this run shares the same time base.
            List<String> dates = getDateList(collectHourCells(doc));

            int count = Math.min(labels.size(), dates.size());
            if (count < labels.size()) {
                System.err.println("Found " + labels.size() + " temperature labels but only "
                        + dates.size() + " hour cells; extra labels are ignored");
            }
            for (int i = 0; i < count; i++) {
                Element value = labels.get(i).nextElementSibling();
                if (value == null) {
                    // The label is the last child of its parent, so there is no value.
                    continue;
                }
                results.add(value.text() + " " + dates.get(i));
            }
        }

        generateCSV(results);
        // Callers receive null, not an empty list, when nothing was extracted.
        return results.isEmpty() ? null : results;
    }
}
package com.company;
import java.util.ArrayList;
/**
 * Command-line entry point: runs one crawl and appends the results to the CSV file
 * named by the first argument.
 */
public class Main {
    /**
     * @param args {@code args[0]} is the path of the CSV file to append to
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            // Report the missing argument instead of failing with an index error.
            System.err.println("Usage: java -jar weather-parser.jar <output-csv-file>");
            System.exit(1);
        }
        FunnyCrawler funnyCrawler = new FunnyCrawler(args[0]);
        // getData() writes the CSV itself; its return value is not needed here.
        funnyCrawler.getData();
    }
}
Manifest-Version: 1.0
Main-Class: com.company.Main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment