@iddoeldor
Created January 27, 2018 21:18
Iterating over HTML sites, extracting files' metadata (currently images only) in parallel using Java 8, Jsoup & Apache Tika
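To compile and run this, the classpath presumably needs jsoup, Apache HttpClient, and the Tika parser bundle; the Maven artifacts would be org.jsoup:jsoup, org.apache.httpcomponents:httpclient, and org.apache.tika:tika-parsers (tika-core alone ships no image parsers, so AutoDetectParser would extract nothing from the downloaded files).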
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.jsoup.Jsoup;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Stream;
public class MetadataExtractor {

    private static final ExecutorService threadPool = Executors.newFixedThreadPool(8);
    /**
     * Load the 3rd column from the CSV, which contains all the URLs.
     * Drop URLs ending in .jpg or .xml; we will handle those afterwards. wp-json also causes issues.
     * Taking only 50 lines just to test:
     * awk -F "\"*,\"*" '{print $3}' Spider.csv | grep -v '.jpg\|.xml\|wp-comments-post.php\|wp-json' | head -n50 > urls.txt
     */
    private static final String URLS_FILE = "../urls.txt";
    /**
     * Metadata tags excluded from the output, to reduce noise when printing each file's metadata.
     */
    private static final Set<String> unnecessaryMetadata = new HashSet<>(Arrays.asList(
            "X-Parsed-By", "height", "width", "Transparency Alpha", "Image Height", "Image Width", "Content-Type",
            "Chroma BlackIsZero", "Compression NumProgressiveScans", "Dimension PixelAspectRatio",
            "Compression Lossless"
    ));
    public static void main(String[] args) throws Exception {
        // load each line, which represents a remote HTML page, in parallel using the Jsoup parser
        // skip(1) = skip the first line because it is the column header ("URI")
        // iterate over the <img> tags, extracting the "src" attribute
        // load each image file into memory & print its metadata using Apache Tika
        try (Stream<String> lines = Files.lines(Paths.get(URLS_FILE))) {
            lines.skip(1).parallel().forEach(line -> {
                try {
                    // currently we only search the HTML page for images, but since Tika's AutoDetectParser is used, any file type would work
                    Jsoup.connect(line).execute().parse().select("img").forEach(image -> threadPool.submit(() -> {
                        try {
                            String fileSource = image.absUrl("src");
                            Metadata metadata = new Metadata();
                            HttpClient client = HttpClientBuilder.create().build();
                            HttpGet httpUriRequest = new HttpGet(fileSource);
                            HttpResponse fileResponse = client.execute(httpUriRequest);
                            HttpEntity entity = fileResponse.getEntity();
                            if (entity != null) {
                                // close the stream once parsing is done
                                try (InputStream inputStream = entity.getContent()) {
                                    BodyContentHandler handler = new BodyContentHandler();
                                    Parser parser = new AutoDetectParser();
                                    parser.parse(inputStream, handler, metadata, new ParseContext());
                                }
                            }
                            // record the HTML page from which the image was extracted
                            metadata.add("X-Original-URL", line);
                            metadata.add("X-Image-Src", fileSource);
                            // drop unnecessary metadata noise
                            Arrays.stream(metadata.names())
                                    .filter(s -> !unnecessaryMetadata.contains(s))
                                    // todo consider json output (mongodb)
                                    .forEach(name -> System.out.printf("\t%s=%s\n", name, metadata.get(name).replace("\n", "")));
                            System.out.println("---");
                        } catch (Exception e) {
                            System.err.printf("Task error [%s]\t%s\n", e.getMessage(), line);
                        }
                    }));
                } catch (Exception e) {
                    System.err.println(e.getMessage() + "\t" + line);
                }
            });
        }
        // let queued tasks finish, then release the pool threads so the JVM can exit
        threadPool.shutdown();
    }
}
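Regarding the "todo consider json output (mongodb)" note above: a minimal sketch of that variant, assuming Jackson's ObjectMapper is on the classpath. The MetadataJson class and its toJson helper are hypothetical names, not part of the gist:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.metadata.Metadata;

import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

public class MetadataJson {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    // serialize one file's filtered metadata into a single JSON document,
    // suitable for newline-delimited output that mongoimport can ingest
    static String toJson(Metadata metadata, Set<String> excluded) throws Exception {
        Map<String, String> fields = new TreeMap<>();
        Arrays.stream(metadata.names())
                .filter(name -> !excluded.contains(name))
                .forEach(name -> fields.put(name, metadata.get(name)));
        return MAPPER.writeValueAsString(fields);
    }
}

The printf loop in main could then become System.out.println(MetadataJson.toJson(metadata, unnecessaryMetadata)), printing one JSON document per image.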