Skip to content

Instantly share code, notes, and snippets.

@ozzi-
Last active June 11, 2020 09:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ozzi-/7087505de8114df5cee8aed26532d61f to your computer and use it in GitHub Desktop.
Save ozzi-/7087505de8114df5cee8aed26532d61f to your computer and use it in GitHub Desktop.
Removes all subdomains of a URL
/**
 * Removes all subdomains from a URL, keeping protocol, registrable domain and
 * everything that follows the host (port, path, query, fragment).
 *
 * <p>Examples: {@code https://a.b.example.com/x} becomes {@code https://example.com/x};
 * with {@code co.uk} in {@code secondLevelDomains}, {@code sub.something.co.uk}
 * becomes {@code something.co.uk} and {@code something.co.uk} is returned unchanged.
 *
 * <p>A second level domain only counts when it matches on a label boundary
 * (so {@code www.tesco.uk} is NOT treated as ending in {@code co.uk}), the comparison
 * ignores case, and when several entries match the longest one wins.
 *
 * @param url                the URL or bare host to shorten; must not be null
 * @param secondLevelDomains public suffixes containing a dot (e.g. {@code co.uk});
 *                           null is treated as an empty list
 * @return the URL with every subdomain label removed from the host
 */
public static String removeSubdomains(String url, ArrayList<String> secondLevelDomains) {
    // We need our URL in three parts: protocol - host - remainder (port/path/query/fragment)
    String protocol = getProtocol(url);
    String rest = url.substring(protocol.length());

    // The authority ends at the first '/', '?' or '#'
    int hostEnd = rest.length();
    for (int i = 0; i < rest.length(); i++) {
        char c = rest.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            hostEnd = i;
            break;
        }
    }
    // A trailing ":port" is not part of the host name. Colons that belong to
    // userinfo ("user:pw@host") or to an IPv6 literal ("[::1]") are left alone.
    int colon = rest.lastIndexOf(':', hostEnd - 1);
    if (colon > rest.lastIndexOf('@', hostEnd - 1)
            && colon > rest.lastIndexOf(']', hostEnd - 1)
            && isDigitsOnly(rest, colon + 1, hostEnd)) {
        hostEnd = colon;
    }
    String urlDomain = rest.substring(0, hostEnd);
    String remainder = rest.substring(hostEnd);

    // Done, now let us count the dots
    int dotCount = Strng.countOccurrences(urlDomain, ".");
    // "example.com" or "localhost" <-- nothing to cut
    if (dotCount < 2) {
        return protocol + rest;
    }

    // Default case "subdomain.example.com": remove everything before the 2nd last dot.
    int dotOffset = 2;
    // Second level domains such as co.uk shift that offset by the number of dots they
    // contain. Take the longest suffix that matches on a label boundary so the result
    // does not depend on the order of the list.
    String longestSuffix = null;
    if (secondLevelDomains != null) {
        for (String secondLevelDomain : secondLevelDomains) {
            if (secondLevelDomain == null || secondLevelDomain.length() == 0) {
                continue;
            }
            if (endsWithDomainSuffix(urlDomain, secondLevelDomain)
                    && (longestSuffix == null || secondLevelDomain.length() > longestSuffix.length())) {
                longestSuffix = secondLevelDomain;
            }
        }
    }
    if (longestSuffix != null) {
        // co.uk = +1
        dotOffset += Strng.countOccurrences(longestSuffix, ".");
    }

    // something.co.uk: offset of 3 but only 2 dots, hence nothing to remove
    if (dotOffset > dotCount) {
        return protocol + urlDomain + remainder;
    }
    // sub.something.co.uk: offset of 3 and 3 dots, so we remove "sub".
    // NOTE(review): relies on Strng.nthLastIndexOf returning the index of the
    // n-th dot counted from the end, exactly as the previous implementation did.
    int pos = Strng.nthLastIndexOf(dotOffset, ".", urlDomain) + 1;
    return protocol + urlDomain.substring(pos) + remainder;
}

/** Returns true when host equals suffix or ends with "." + suffix, ignoring case. */
private static boolean endsWithDomainSuffix(String host, String suffix) {
    int start = host.length() - suffix.length();
    if (start < 0 || !host.regionMatches(true, start, suffix, 0, suffix.length())) {
        return false;
    }
    return start == 0 || host.charAt(start - 1) == '.';
}

/** Returns true when every character of s in [from, to) is an ASCII digit (empty range included). */
private static boolean isDigitsOnly(String s, int from, int to) {
    for (int i = from; i < to; i++) {
        char c = s.charAt(i);
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}
/**
 * Matches a leading "scheme://", a bare "://" or a protocol-relative "//".
 * The scheme follows RFC 3986 (letter first, then letters, digits, '+', '-', '.').
 * Compiled once because Pattern instances are immutable and reusable.
 */
private static final Pattern PROTOCOL_PREFIX =
        Pattern.compile("^(?:(?:[a-zA-Z][a-zA-Z0-9+.\\-]*)?:)?//");

/**
 * Extracts the protocol prefix of a URL, including the trailing "//".
 *
 * @param url the URL to inspect; may be null
 * @return e.g. "https://" or "//"; the empty string when the URL does not
 *         start with a protocol prefix or is null
 */
public static String getProtocol(String url) {
    if (url == null) {
        return "";
    }
    Matcher m = PROTOCOL_PREFIX.matcher(url);
    return m.find() ? m.group() : "";
}
/**
 * Returns the list of multi-label public suffixes ("second level domains" such as
 * co.uk) used to decide where a registrable domain starts.
 *
 * <p>When {@code loadFromPublicSufficOrg} is true the list is downloaded from
 * publicsuffix.org; comment lines, wildcard rules ("*."), exception rules ("!")
 * and single-label entries are skipped. If the download fails or yields nothing,
 * the small built-in list is returned instead so callers always get usable data.
 * When the flag is false only the built-in list is returned and no network
 * access happens.
 *
 * @param loadFromPublicSufficOrg true to download the full list, false to use the built-in one
 * @return a new, mutable list of suffixes; never null
 */
public static ArrayList<String> getPublicSuffixList(boolean loadFromPublicSufficOrg) {
    ArrayList<String> secondLevelDomains = new ArrayList<String>();
    if (loadFromPublicSufficOrg) {
        try {
            String listBody = URLHelpers.getHTTP("https://publicsuffix.org/list/public_suffix_list.dat", false, true);
            Scanner scanner = new Scanner(listBody);
            try {
                while (scanner.hasNextLine()) {
                    // A rule is only significant up to the first whitespace; trim strays
                    String line = scanner.nextLine().trim();
                    if (!line.startsWith("//") && !line.startsWith("*") && !line.startsWith("!") && line.contains(".")) {
                        secondLevelDomains.add(line);
                    }
                }
            } finally {
                scanner.close();
            }
        } catch (Exception e) {
            // Best effort: report the problem and fall back to the built-in list below
            e.printStackTrace();
        }
        if (!secondLevelDomains.isEmpty()) {
            return secondLevelDomains;
        }
    }
    String[] builtIn = {
        "co.uk", "co.at", "or.at", "ac.at", "gv.at",
        "ac.uk", "gov.uk", "ltd.uk",
        "fed.us", "isa.us", "nsn.us", "dni.us",
        "ac.ru", "com.ru", "edu.ru", "gov.ru", "int.ru", "mil.ru", "net.ru", "org.ru", "pp.ru",
        "com.au", "net.au", "org.au", "edu.au", "gov.au"
    };
    for (String suffix : builtIn) {
        secondLevelDomains.add(suffix);
    }
    return secondLevelDomains;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment