Last active
March 11, 2019 14:58
-
-
Save jhamberg/18f0d90701d2acf5f37ad30c7660e994 to your computer and use it in GitHub Desktop.
Find application category on Android without dependencies by parsing the HTML from Play Store
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.NoSuchElementException; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
// Created by Jonatan Hamberg on 11.3.2019. | |
// (C) 2019 - University of Helsinki | |
/**
 * Finds an application's category by scanning the HTML of its Play Store page with a regular
 * expression, so that no HTML-parsing dependency is required.
 */
public class PlayCrawler {
    // Matches an anchor tag that starts with the genre marker "<a itemprop=\"genre\"", followed
    // by one or more non-closing characters "[^>]+" (whitespace and/or other attributes) and
    // then an href attribute whose value starts with "http://" or "https://" and continues with
    // the Play Store category url "play.google.com/store/apps/category/". Everything after that
    // prefix up to the closing quote of the attribute "([^\"]+)" is captured as group 1, which
    // represents the application category. The dots of the host name are escaped so that they
    // only match literal dots.
    private static final String PATTERN_CATEGORY =
            "<a itemprop=\"genre\"[^>]+" +
            "href=\"https?://play\\.google\\.com/store/apps/category/([^\"]+)\"";

    // Compiled once and reused; Pattern instances are immutable and safe to share.
    private static final Pattern CATEGORY = Pattern.compile(PATTERN_CATEGORY);

    // Any run of whitespace: spaces, tabs, "\n", "\r", form feeds and vertical tabs.
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Extracts the application category from the HTML of a Play Store application page.
     *
     * @param html the page markup to search; may be {@code null}
     * @return the category segment of the first genre link found (for example
     *         {@code GAME_PUZZLE}), or {@code null} when {@code html} is {@code null} or
     *         contains no matching genre link
     */
    public static String parseCategory(String html) {
        if (html == null) {
            return null;
        }
        Matcher matcher = CATEGORY.matcher(clean(html));
        // The pattern declares exactly one capturing group, so a successful find() is all that
        // is needed before reading group 1. (groupCount() is a property of the pattern, not of
        // the match, and is always 1 here.)
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * Collapses every run of whitespace into a single space. This helps with html tags that
     * span multiple lines, regardless of the line-ending style (LF or CRLF) and of any
     * indentation or trailing whitespace around the line breaks.
     *
     * @param html the markup to normalize; must not be {@code null}
     * @return the markup with each whitespace run replaced by one space
     */
    private static String clean(String html) {
        return WHITESPACE.matcher(html).replaceAll(" ");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment