Skip to content

Instantly share code, notes, and snippets.

@sbutterfield
Created June 21, 2022 16:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sbutterfield/952dd3f702fcb54daebd2405e8eb4cc9 to your computer and use it in GitHub Desktop.
Save sbutterfield/952dd3f702fcb54daebd2405e8eb4cc9 to your computer and use it in GitHub Desktop.
Apex URI utilities for matching and parsing URI and URL groups
/**
 * @author: Shawn Butterfield
 * Utility class for URI (URL/URN/Email) data validation, parsing and matching.
 * Methods for pattern extraction defined by URI standard syntax, here: http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax
 *
 * NOTE: several public methods below are placeholders that currently return null;
 * each one is marked as "Not implemented" in its own comment.
 */
public class URIUtils {
    // Matches a leading URI scheme followed by a colon, e.g. "http:" or "mailto:".
    private static final String REGEX_URL_SCHEME = '^([a-zA-Z][a-zA-Z0-9\\+\\.\\-]+):';
    // Compiled once; the scheme regex is a constant, so there is no need to recompile it per call.
    private static final Pattern URL_SCHEME_PATTERN = Pattern.compile(REGEX_URL_SCHEME);
    // Loaded from a static resource; null when the resource is missing or empty (see getUrlPatternString).
    private static final String REGEX_URI_EMAIL_PATTERN = getUrlPatternString();
    // Fragments that can break an API request due to encoding; replaced wherever they occur.
    private static final String[] UNSAFE_CHARS = new String[] { ',', '.', '&', '_', '0' };
    // Fragments that are only removed when they are the trailing token of the input.
    private static final String[] UNSAFE_SUFFIXES = new String[] { 'Inc', 'Corp', 'Co', 'Ltd' };

    /**
     * Not implemented; currently always returns null.
     */
    public static Boolean isValidEmail(String input) {
        return null;
    }

    /**
     * Strips fragments that are known to cause trouble when the value is sent in an API request.
     *
     * Every occurrence of an entry in UNSAFE_CHARS is replaced by a single space. After that, an
     * entry of UNSAFE_SUFFIXES is removed only when it is the final, standalone token of the
     * string (the whole string, or preceded by a space), so "Acme Inc" loses "Inc" but "Zinc"
     * and "Costco" are left alone. Matching is case-sensitive.
     *
     * @param input the raw value; may be null
     * @return the cleaned, trimmed value, or null when input is null
     */
    public static String safeEncode(String input) {
        if (input == null) {
            return input;
        }
        String result = input;
        for (String unsafe : UNSAFE_CHARS) {
            result = result.replace(unsafe, ' ');
        }
        // Trim first: punctuation such as the dot in "Inc." has just become a trailing space,
        // which would otherwise hide the suffix from the end-of-string check.
        result = result.trim();
        for (String suffix : UNSAFE_SUFFIXES) {
            if (result.equals(suffix) || result.endsWith(' ' + suffix)) {
                // removeEnd only touches the trailing occurrence; earlier occurrences are kept.
                result = result.removeEnd(suffix);
            }
        }
        return result.trim();
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static Boolean isValidUri(String input) {
        return null;
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static Boolean hasSubDomain(String input) {
        return null;
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static String parseSchemeFromUri(String input) {
        return null;
    }

    /**
     * Prepends "http://" to a URL that does not already start with a scheme.
     *
     * Any scheme matching REGEX_URL_SCHEME counts (not only http/https), so "ftp://host" is
     * returned unchanged. Note that the pattern also treats "host:8080" as having a scheme.
     *
     * @param url the URL to check; may be null or blank
     * @return url unchanged when it is null, blank, or already has a scheme; otherwise 'http://' + url
     */
    public static String ensureStartsWithHttp(String url) {
        if (String.isBlank(url)) {
            return url;
        }
        try {
            return URL_SCHEME_PATTERN.matcher(url).find() ? url : 'http://' + url;
        }
        catch (Exception e) {
            // Best effort: hand the caller back what they gave us rather than failing.
            return url;
        }
    }

    /**
     * Extracts the registrable domain (e.g. "salesforce.com" or "salesforce.co.uk") from a URI or
     * e-mail address using the pattern stored in the REGEX_URI_EMAIL_PATTERN static resource.
     *
     * The capture-group numbering used below is described on getUrlPatternString().
     *
     * @param input a URI or e-mail address; may be null or blank
     * @return the lower-cased domain when the pattern matches; null when it does not match;
     *         the input itself when it is blank or when an exception occurs while parsing
     *         (in that case the returned value may already have been truncated by the
     *         "@ in a parameter" clean-up below)
     */
    public static String parseDomain(String input) {
        if (String.isBlank(input)) {
            return input;
        }
        String domain;
        try {
            System.debug( 'PD - INPUT: ' + input );
            // Pattern.compile throws when the static resource could not be loaded (null pattern);
            // that is handled by the catch below.
            Pattern p = Pattern.compile(REGEX_URI_EMAIL_PATTERN);
            Matcher m = p.matcher(input);
            // Every branch except the first returns, so the loop only repeats after a reset.
            while (m.matches()) {
                /* Fix inputs with an "@" character in a parameter that can break parsing: remove
                 * everything after the last occurrence of the "/" char.
                 * Group0 = Input.
                 * Group4 must be null.
                 * Group2 must contain at least one dot character.
                 * Finish by re-submitting the input; a given input may hit this logic more than
                 * once before being declared valid or invalid.
                 */
                if (
                    m.group(0).contains('@')
                    && (m.group(4) == null && m.group(2).countMatches('.') >= 1)
                    && m.group(0).contains('/')
                ) {
                    input = m.group(0).substringBeforeLast('/');
                    System.debug( 'PD - INPUT INSIDE: ' + input );
                    m = m.reset(input);
                }
                // Input that contains a valid e-mail address for domain parsing, possibly with
                // other characteristics as well.
                else if (
                    m.group(0).contains('@')
                    && m.group(2) != null
                    && m.group(5) != null
                    && m.group(6) != null
                ) {
                    // Handle ccSLDs and sub-domains in e-mail addresses
                    if (
                        m.group(0).countMatches('.') >= 2
                        || m.group(5).length() <= 2
                        || m.group(6).length() <= 2
                    ) {
                        System.debug( 'PD - Group 0: ' + m.group(0) );
                        domain = parseOutCcSld(m, input);
                        return domain;
                    }
                    else {
                        domain = m.group(5) + '.' + m.group(6);
                        // Strings are immutable: the lower-cased copy has to be assigned back.
                        domain = domain.toLowerCase();
                        return domain;
                    }
                }
                // Check for sub-domains and ccSLDs in a URI and parse accordingly.
                else if (
                    m.group(0).countMatches('.') > 2
                    || m.group(5).length() <= 2
                    || m.group(6).length() <= 2
                ) {
                    domain = parseOutCcSld(m, input);
                    return domain;
                }
                // Final branch: assume a clean URI and extract the domain.
                else {
                    domain = m.group(5) + '.' + m.group(6);
                    domain = domain.toLowerCase();
                    return domain;
                }
            }
        }
        catch (Exception e) {
            System.debug(LoggingLevel.ERROR,'An error occurred while parsing domain, returning to input stream ' +e.getStackTraceString());
            return input;
        }
        // Reached only when the pattern did not match; domain is still null here.
        return domain;
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static String parseDomainWithSubdomain(String input) {
        return null;
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static String parseParameters(String input) {
        return null;
    }

    /**
     * Not implemented; currently always returns null.
     */
    public static String parseParameter(String input, Integer n) {
        return null;
    }

    /**
     * Builds the domain from capture groups 4, 5 and 6 of an already-matched Matcher, keeping the
     * last label of group 4 when group 5 looks like a second-level country-code label.
     *
     * When groups 4, 5 and 6 are all present and group 5 is at most 3 characters long
     * (ie: "salesforce.co.uk" versus "www.salesforce.com"), the result is
     * lastLabel(group4) + '.' + group5 + '.' + group6; otherwise it is group5 + '.' + group6.
     *
     * @param m     a Matcher positioned on a successful match
     * @param input the original input; not used by this method
     * @return the lower-cased domain
     */
    private static String parseOutCcSld(Matcher m, String input) {
        String domain;
        if (
            m.group(4) != null
            && m.group(5) != null
            && m.group(6) != null
            && m.group(5).length() <= 3
        ) {
            if (m.group(4).countMatches('.') > 0) {
                // Group 4 may hold several labels ("www.usb"); only the last one belongs to the domain.
                domain = m.group(4).substringAfterLast('.') + '.';
            }
            else {
                domain = m.group(4) + '.';
            }
        }
        if (domain != null) {
            // Now add on the proper ccSLD and ccTLD
            domain += m.group(5) + '.' + m.group(6);
        }
        else {
            // Otherwise domain is clean with or without country code
            domain = m.group(5) + '.' + m.group(6);
        }
        // Strings are immutable: return the lower-cased copy rather than discarding it.
        return domain.toLowerCase();
    }

    /**
     * Loads the URI/e-mail regular expression from the static resource named
     * 'REGEX_URI_EMAIL_PATTERN'.
     *
     * Current URL Regex Pattern builds a match using 11 different capture groups which cover:
     * Scheme/Protocol: http, https, ftp(s), uri, afp, mailto, service, email address (sbutterfield@salesforce.com) etc.
     * Fully Qualified Scheme: http://, https://, afp://
     * Authority/Domain: www.salesforce.com, salesforce.com
     * TLD: .com, .sk
     * ccSLDs: .co.uk, .com.br etc
     * IP Address (v4)
     * Port Number (ICANN)
     * Path, Sub Path and File Name: /pub/Main.aspx
     * Query: ?adp=1
     * Fragment: .aspx, .asp, %$query
     * https://www.usb.regexlib.com:6553/Search.aspx?query=1#%Metadata >> yields:
     * Group0: [INPUT] https://www.usb.regexlib.com:6553/Search.aspx?query=1#%Metadata
     * Group1: https
     * Group2: [No Match]
     * Group3: [No Match]
     * Group4: www.usb
     * Group5: regexlib
     * Group6: com
     * Group7: 6553
     * Group8: [No Match]
     * Group9: /Search.aspx
     * Group10: query=1
     * Group11: %Metadata
     *
     * @return the pattern text, or null when the resource is missing, empty, or cannot be read
     */
    private static String getUrlPatternString() {
        String result;
        try {
            Blob content = [Select Body from StaticResource where Name = 'REGEX_URI_EMAIL_PATTERN'].Body;
            if (content.size() > 0) {
                result = content.toString();
            }
        }
        catch (Exception e) {
            // Deliberately non-fatal: callers see a null pattern and fall back to returning their input.
            System.debug(LoggingLevel.ERROR,'Unable to retrieve REGEX_URI_EMAIL_PATTERN as a string from static resources.' +e);
        }
        return result;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment