Created
October 18, 2015 10:29
-
-
Save allaniftrue/15c7441360034b3bf973 to your computer and use it in GitHub Desktop.
Extract the EasyList ad-server entries from the full easylist.txt file for use in DNS configuration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Extracts the ad-server section from a local EasyList file and writes the
 * host entries found in it to a plain-text file, one entry per line.
 *
 * Input:  easylist.txt          (read from the current working directory)
 * Output: easylist_domains.txt  (written to the current working directory)
 *
 * A RuntimeException is thrown when the input cannot be read, when the
 * ad-server section markers are not found, or when the output cannot be
 * written, so the script never silently produces an empty result file.
 */

# Get contents from the local EasyList copy
$content = file_get_contents('easylist.txt');
if ($content === false) {
    throw new RuntimeException('Unable to read easylist.txt');
}

# Extract adserver list only: everything from the
# "! *** easylist:easylist/easylist_adservers.txt ***" marker up to and
# including the "!---...Third-party adverts...---!" header.
# Note: [^"]+ spans newlines but will not cross a double-quote character.
$sectionPattern = '/(\!\s\*\*\*\seasylist:easylist\/easylist_adservers\.txt\s\*\*\*)[^"]+(\!\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-Third\-party\sadverts\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\!)/';
if (preg_match($sectionPattern, $content, $extracted) !== 1) {
    # Covers both "no match" (0) and a PCRE failure (false)
    throw new RuntimeException('Adserver section not found in easylist.txt');
}

/*
 * Remove unnecessary strings from every entry
 */
# Drop "!---Some heading---!" section headers ("." does not cross newlines)
$content = preg_replace('/\!\-\-\-(.*?)\-\-\-\!/', '', $extracted[0]);
if ($content === null) {
    throw new RuntimeException('Failed to strip section headers');
}

# Drop the leading "||" domain anchor
$content = str_replace('||', '', $content);

# Drop the "^" separator and everything after it on the same line
$content = preg_replace('/\^.*/', '', $content);
if ($content === null) {
    throw new RuntimeException('Failed to strip filter options');
}

# Go through every entry and keep those that form a valid URL once a scheme
# is prepended. Entries are trimmed first so that CRLF line endings do not
# cause otherwise valid entries to be rejected.
# The former FILTER_VALIDATE_IP test was removed: with the "http://" prefix it
# could never succeed, and IPv4 hosts already pass the URL validation.
$filteredList = [];
foreach (explode("\n", $content) as $line) {
    $url = trim($line);
    if ($url !== '' && filter_var('http://' . $url, FILTER_VALIDATE_URL) !== false) {
        $filteredList[] = $url;
    }
}

# Extract unique entries from the list
$filteredList = array_unique($filteredList);

# Write result into a file for further processing
if (file_put_contents('easylist_domains.txt', implode("\n", $filteredList)) === false) {
    throw new RuntimeException('Unable to write easylist_domains.txt');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment