Skip to content

Instantly share code, notes, and snippets.

@menon92
Created January 29, 2020 10:08
Show Gist options
  • Save menon92/270d91f6b5b4049634fb5a93e6c7a1f0 to your computer and use it in GitHub Desktop.
Save menon92/270d91f6b5b4049634fb5a93e6c7a1f0 to your computer and use it in GitHub Desktop.
package scraper;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Timer;
import java.util.TimerTask;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.parser.Parser;
/**
 * Periodically scrapes {@code url} attribute values from an RSS feed, appends them to a
 * log file, and periodically prints that log file to standard output.
 *
 * <p>Both jobs are scheduled on one {@link Timer}, so they execute sequentially on that
 * timer's single background thread and never touch the log file at the same time.
 */
public class ProcessUrl {
    /** Feed that is fetched and scanned for elements carrying a {@code url} attribute. */
    public static String BASE_URL = "http://rss.cnn.com/rss/edition.rss";
    /** Working directory of the running JVM; root for everything this class writes. */
    public static String BASE_PATH = System.getProperty("user.dir");
    /** Directory that holds the log file. */
    public static String LOG_DIR = BASE_PATH + "/" + "data";
    /** File the scraped urls are appended to and later printed from. */
    public static String LOG_FILE = LOG_DIR + "/" + "a.rss";

    /** Milliseconds before the first fetch-and-write run (1 second). */
    public static long delayForFetchedAndWrite = 1 * 1000;
    /** Milliseconds between fetch-and-write runs (20 seconds). */
    public static long periodForFetchedAndWrite = 20 * 1000;
    /** Milliseconds before the first print run (1 second). */
    public static long delayForPrint = 1 * 1000;
    /** Milliseconds between print runs (25 seconds). */
    public static long periodForPrint = 25 * 1000;

    /**
     * Entry point: makes sure the log directory exists, then starts the periodic jobs.
     *
     * @param args command-line arguments (unused)
     * @throws IOException declared for backward compatibility; not thrown by this method
     */
    public static void main(String[] args) throws IOException {
        createDirs();
        doWork();
    }

    /**
     * Schedules the two repeating jobs: fetch-and-append, and print-the-log.
     *
     * <p>The timer is created with the default (non-daemon) thread, so the process keeps
     * running after {@code main} returns.
     */
    public static void doWork() {
        Timer timer = new Timer();

        // Job 1: fetch urls from the feed and append them to the log file.
        TimerTask timerTaskFetchedAndWrite = new TimerTask() {
            @Override
            public void run() {
                try {
                    System.out.println("Scraping new urls ...");
                    ArrayList<String> urls = getImageUrls(BASE_URL);
                    writeUrls(urls);
                } catch (RuntimeException e) {
                    // An exception escaping run() terminates the timer thread, which
                    // would silently stop every later run of BOTH jobs.
                    e.printStackTrace();
                }
            }
        };

        // Job 2: read the log file back and print it.
        TimerTask timerTaskPrint = new TimerTask() {
            @Override
            public void run() {
                System.out.println("Print data form file ...");
                System.out.println("*********************");
                printLogFile();
            }
        };

        timer.schedule(timerTaskFetchedAndWrite, delayForFetchedAndWrite, periodForFetchedAndWrite);
        timer.schedule(timerTaskPrint, delayForPrint, periodForPrint);
    }

    /** Prints every line of {@link #LOG_FILE} to standard output; failures are only logged. */
    private static void printLogFile() {
        // try-with-resources closes the reader even when reading fails part-way.
        try (BufferedReader reader = new BufferedReader(new FileReader(LOG_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } catch (Exception e) {
            // Broad on purpose: this runs inside a TimerTask and must never let an
            // exception reach the timer thread (e.g. the file may not exist yet).
            e.printStackTrace();
        }
    }

    /**
     * Ensures {@link #LOG_DIR} exists as a directory, creating missing parents as needed,
     * and reports the outcome on the console.
     */
    public static void createDirs() {
        try {
            File dir = new File(LOG_DIR);
            if (dir.isDirectory()) {
                System.out.println(LOG_DIR + " Already exists ...");
            } else if (dir.mkdirs()) {
                // Only claim success when the directory was really created.
                System.out.println(LOG_DIR + " created ...");
            } else {
                System.err.println(LOG_DIR + " could not be created ...");
            }
        } catch (RuntimeException e) {
            // Best effort (e.g. SecurityException): report and let the caller continue.
            e.printStackTrace();
        }
    }

    /**
     * Appends a {@code yyyyMMddHHmmss} timestamp line followed by one line per url to
     * {@link #LOG_FILE}.
     *
     * @param urls urls to append; {@code null} is treated as an empty list, so only the
     *             timestamp line is written
     */
    public static void writeUrls(ArrayList<String> urls) {
        System.out.println("Writtng urls to file ...");
        String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        // Opened in append mode; try-with-resources flushes and closes the writer even
        // if a write fails, so nothing already buffered is lost and no handle leaks.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(LOG_FILE, true))) {
            writer.write(timeStamp + "\n");
            if (urls != null) {
                for (String url : urls) {
                    writer.write(url + "\n");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads {@code baseUrl}, parses it with jsoup's XML parser, and collects the value
     * of the {@code url} attribute from every element that has one.
     *
     * @param baseUrl address of the feed to fetch
     * @return the collected attribute values in document order; empty (never {@code null})
     *         when the fetch fails with an {@link IOException}
     */
    public static ArrayList<String> getImageUrls(String baseUrl) {
        ArrayList<String> imageUrls = new ArrayList<String>();
        try {
            Document doc = Jsoup.connect(baseUrl).parser(Parser.xmlParser()).get();
            Elements elements = doc.getElementsByAttribute("url");
            for (Element element : elements) {
                String imageUrl = element.attr("url");
                System.out.println("image ulr: " + imageUrl);
                imageUrls.add(imageUrl);
            }
        } catch (IOException e) {
            // Network or HTTP failure: log it and return whatever was gathered so far.
            e.printStackTrace();
        }
        return imageUrls;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment