@iddoeldor
Created January 27, 2018 21:18
Iterating over HTML sites, extracting files' metadata (currently images only) in parallel using Java 8, Jsoup & Apache Tika
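To compile and run this, the classpath presumably needs jsoup, Apache HttpClient, and the Tika parser bundle; the Maven artifacts would be org.jsoup:jsoup, org.apache.httpcomponents:httpclient, and org.apache.tika:tika-parsers (tika-core alone ships no image parsers, so AutoDetectParser would extract nothing from the downloaded files).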
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.jsoup.Jsoup;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Stream;
public class MetadataExtractor {

    private static final ExecutorService threadPool = Executors.newFixedThreadPool(8);
    /**
     * Load the 3rd column from the CSV, which contains all the URLs.
     * Drop URLs ending in .jpg or .xml; we will handle those afterwards. wp-json also causes issues.
     * Taking only 50 lines just to test:
     * awk -F "\"*,\"*" '{print $3}' Spider.csv | grep -v '.jpg\|.xml\|wp-comments-post.php\|wp-json' | head -n50 > urls.txt
     */
    private static final String URLS_FILE = "../urls.txt";
    /**
     * Metadata tags excluded from the output, to reduce noise when printing each file's metadata.
     */
    private static final Set<String> unnecessaryMetadata = new HashSet<>(Arrays.asList(
            "X-Parsed-By", "height", "width", "Transparency Alpha", "Image Height", "Image Width", "Content-Type",
            "Chroma BlackIsZero", "Compression NumProgressiveScans", "Dimension PixelAspectRatio",
            "Compression Lossless"
    ));
    public static void main(String[] args) throws Exception {
        // load each line, which represents a remote HTML page, in parallel using the Jsoup parser
        // skip(1) = skip the first line because it is the column header ("URI")
        // iterate over the <img> tags, extracting the "src" attribute
        // load each image file into memory & print its metadata using Apache Tika
        try (Stream<String> lines = Files.lines(Paths.get(URLS_FILE))) {
            lines.skip(1).parallel().forEach(line -> {
                try {
                    // currently we only search the HTML page for images, but since Tika's AutoDetectParser is used, any file type would work
                    Jsoup.connect(line).execute().parse().select("img").forEach(image -> threadPool.submit(() -> {
                        try {
                            String fileSource = image.absUrl("src");
                            Metadata metadata = new Metadata();
                            HttpClient client = HttpClientBuilder.create().build();
                            HttpGet httpUriRequest = new HttpGet(fileSource);
                            HttpResponse fileResponse = client.execute(httpUriRequest);
                            HttpEntity entity = fileResponse.getEntity();
                            if (entity != null) {
                                // close the stream once parsing is done
                                try (InputStream inputStream = entity.getContent()) {
                                    BodyContentHandler handler = new BodyContentHandler();
                                    Parser parser = new AutoDetectParser();
                                    parser.parse(inputStream, handler, metadata, new ParseContext());
                                }
                            }
                            // record the HTML page from which the image was extracted
                            metadata.add("X-Original-URL", line);
                            metadata.add("X-Image-Src", fileSource);
                            // drop unnecessary metadata noise
                            Arrays.stream(metadata.names())
                                    .filter(s -> !unnecessaryMetadata.contains(s))
                                    // todo consider json output (mongodb)
                                    .forEach(name -> System.out.printf("\t%s=%s\n", name, metadata.get(name).replace("\n", "")));
                            System.out.println("---");
                        } catch (Exception e) {
                            System.err.printf("Task error [%s]\t%s\n", e.getMessage(), line);
                        }
                    }));
                } catch (Exception e) {
                    System.err.println(e.getMessage() + "\t" + line);
                }
            });
        }
        // let queued tasks finish, then release the pool threads so the JVM can exit
        threadPool.shutdown();
    }
}
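Regarding the "todo consider json output (mongodb)" note above: a minimal sketch of that variant, assuming Jackson's ObjectMapper is on the classpath. The MetadataJson class and its toJson helper are hypothetical names, not part of the gist:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.metadata.Metadata;

import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

public class MetadataJson {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    // serialize one file's filtered metadata into a single JSON document,
    // suitable for newline-delimited output that mongoimport can ingest
    static String toJson(Metadata metadata, Set<String> excluded) throws Exception {
        Map<String, String> fields = new TreeMap<>();
        Arrays.stream(metadata.names())
                .filter(name -> !excluded.contains(name))
                .forEach(name -> fields.put(name, metadata.get(name)));
        return MAPPER.writeValueAsString(fields);
    }
}

The printf loop in main could then become System.out.println(MetadataJson.toJson(metadata, unnecessaryMetadata)), printing one JSON document per image.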