Created
January 27, 2018 21:18
-
-
Save iddoeldor/88331e80542020707dfb1b29a3b52f6a to your computer and use it in GitHub Desktop.
Iterates over HTML pages and extracts file metadata (currently images only) in parallel, using Java 8, Jsoup and Apache Tika.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.jsoup.Jsoup;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Stream;
public class MetadataExtractor { | |
private static final ExecutorService threadPool = Executors.newFixedThreadPool(8); | |
/** | |
* load the 3rd column from CSV which contains all the urls | |
* remove .jpg or .xml suffix, we will handle them afterwards.. also wp-json cause issues | |
* taking 50 lines just to test | |
* awk -F "\"*,\"*" '{print $3}' Spider.csv | grep -v '.jpg\|.xml\|wp-comments-post.php\|wp-json' | head -n50 > urls.txt | |
*/ | |
private static final String URLS_FILE = "../urls.txt"; | |
/** | |
* unused metadata tags to reduce noise when printing each file metadata | |
*/ | |
private static final Set<String> unnecessaryMetadata = new HashSet<>(Arrays.asList( | |
"X-Parsed-By", "height", "width", "Transparency Alpha", "Image Height", "Image Width", "Content-Type", | |
"Chroma BlackIsZero", "Compression NumProgressiveScans", "Dimension PixelAspectRatio", | |
"Compression Lossless" | |
)); | |
public static void main(String[] args) throws Exception { | |
// loading each line which represent remote html file in parallel using Jsoup parser | |
// skip(1) = skipping first line because it is the column header ( "URI" ) | |
// iterating over the <img> tag, extracting the "src" attribute | |
// loading the image file into memory & printing metadata using Apache Tika | |
Files.lines(Paths.get(URLS_FILE)).skip(1).parallel().forEach(line -> { | |
try { | |
// currently searching the html page for images but were using Tika's AutoDetectParser so it doesn't matter | |
Jsoup.connect(line).execute().parse().select("img").forEach(image -> threadPool.submit(() -> { | |
try { | |
String fileSource = image.absUrl("src"); | |
Metadata metadata = new Metadata(); | |
HttpClient client = HttpClientBuilder.create().build(); | |
HttpGet httpUriRequest = new HttpGet(fileSource); | |
HttpResponse fileResponse = client.execute(httpUriRequest); | |
HttpEntity entity = fileResponse.getEntity(); | |
if (entity != null) { | |
InputStream inputStream = entity.getContent(); | |
BodyContentHandler handler = new BodyContentHandler(); | |
Parser parser = new AutoDetectParser(); | |
parser.parse(inputStream, handler, metadata, new ParseContext()); | |
} | |
// saving the html page which we extracted the image from | |
metadata.add("X-Original-URL", line); | |
metadata.add("X-Image-Src", fileSource); | |
// remove unnecessary metadata noise.. | |
Arrays.asList(metadata.names()) | |
.stream() | |
.filter(s -> !unnecessaryMetadata.contains(s)) | |
// todo consider json output (mongodb) | |
.forEach(name -> System.out.printf("\t%s=%s\n", name, metadata.get(name).replace("\n", ""))); | |
System.out.println("---"); | |
} catch (Exception e) { | |
System.err.printf("Task error [%s] \t%s\n", e.getMessage(), line); | |
} | |
})); | |
} catch (Exception e) { | |
System.err.println(e.getMessage() + "\t" + line); | |
} | |
}); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment