Created
January 29, 2020 10:08
-
-
Save menon92/270d91f6b5b4049634fb5a93e6c7a1f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package scraper; | |
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.text.SimpleDateFormat; | |
import java.util.ArrayList; | |
import java.util.Date; | |
import java.util.Timer; | |
import java.util.TimerTask; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
import org.jsoup.parser.Parser; | |
public class ProcessUrl { | |
public static String BASE_URL = "http://rss.cnn.com/rss/edition.rss"; | |
public static String BASE_PATH = System.getProperty("user.dir"); | |
public static String LOG_DIR = BASE_PATH + "/" + "data"; | |
public static String LOG_FILE = LOG_DIR + "/" + "a.rss"; | |
public static long delayForFetchedAndWrite = 1 * 1000; // will start after 1 seconds | |
public static long periodForFetchedAndWrite = 20 * 1000; // repeat after each 20 seconds | |
public static long delayForPrint = 1 * 1000; // will start after 1 seconds | |
public static long periodForPrint = 25 * 1000; // repeat after each 25 seconds | |
public static void main(String[] args) throws IOException { | |
createDirs(); | |
doWork(); | |
} | |
public static void doWork() { | |
Timer timer = new Timer(); | |
// thread for fetch data from urls and write to a file | |
TimerTask timerTaskFetchedAndWrite = new TimerTask() { | |
@Override | |
public void run() { | |
System.out.println("Scraping new urls ..."); | |
ArrayList<String> urls = getImageUrls(BASE_URL); | |
writeUrls(urls); | |
} | |
}; | |
// thread for read and print data for file | |
TimerTask timerTaskPrint = new TimerTask() { | |
@Override | |
public void run() { | |
System.out.println("Print data form file ..."); | |
try { | |
System.out.println("*********************"); | |
BufferedReader br = new BufferedReader(new FileReader(LOG_FILE)); | |
String line; | |
while ((line = br.readLine()) != null) { | |
System.out.println(line); | |
} | |
br.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
}; | |
// schedule task | |
timer.schedule(timerTaskFetchedAndWrite, delayForFetchedAndWrite, periodForFetchedAndWrite); | |
timer.schedule(timerTaskPrint, delayForPrint, periodForPrint); | |
} | |
public static void createDirs() { | |
try { | |
File files = new File(LOG_DIR); | |
if(!files.exists()) { | |
files.mkdir(); | |
System.out.println(LOG_DIR + " created ..."); | |
} else { | |
System.out.println(LOG_DIR + " Already exists ..."); | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void writeUrls(ArrayList<String> urls){ | |
System.out.println("Writtng urls to file ..."); | |
String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); | |
try { | |
BufferedWriter writer; | |
writer = new BufferedWriter(new FileWriter(LOG_FILE, true)); | |
writer.write(timeStamp + "\n"); | |
for(String url: urls) { | |
writer.write(url + "\n"); | |
} | |
writer.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
public static ArrayList<String> getImageUrls(String baseUrl) { | |
ArrayList<String> imageUrls = new ArrayList<String>(); | |
// imageUrls.add("abc"); | |
// imageUrls.add("def"); | |
// imageUrls.add("def"); | |
try { | |
Document doc = Jsoup.connect(baseUrl).parser(Parser.xmlParser()).get(); | |
Elements elements = doc.getElementsByAttribute("url"); | |
for(Element element : elements) { | |
String imageUrl = element.attr("url"); | |
System.out.println("image ulr: " + imageUrl); | |
imageUrls.add(imageUrl); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
return imageUrls; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment