Last active
January 3, 2016 14:29
-
-
Save nuria/8476742 to your computer and use it in GitHub Desktop.
Removing pieces of the user agent that are not needed for device, major and minor browser version extraction, thus sanitizing the UA. The percentage of user agents that can be identified uniquely in the set after running this script should be reduced by about a half. Example: if in a dataset of 80.000 we were able to identify uniquely 800 UA (1%) after …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import re | |
# anonymizes ua | |
def sanitize(ua):
    """Strip the parts of a user-agent string that are not needed to identify it.

    Applies a fixed sequence of regular-expression substitutions that drop or
    shorten patch-level versions, language tags, build identifiers and similar
    high-entropy tokens. The order of the substitutions matters: later rules
    run on the output of earlier ones.

    Args:
        ua: The raw user-agent string.

    Returns:
        The sanitized user-agent string.
    """
    # NOTE: \w already matches digits, so the patterns below use a plain \w
    # (or a character class) instead of the overlapping alternation (\d|\w).
    # The alternation matches exactly the same text, but it lets every digit
    # be matched in two ways, which makes a failed match backtrack
    # exponentially on a long run of digits.

    # patch-level versions: 1.1.2 -> 1.1
    processed = re.sub(r'(\d+\.\d+)(\.\w+)+', r'\1', ua)
    # remove language headers such as en-EN, keeping the delimiter after them
    processed = re.sub(r'\s\w\w-\w\w(;|\s|\)){1}', r'\1', processed)
    # blackberry language headers are: BlackBerry 9300; en)
    processed = re.sub(r'(BlackBerry\s\d{4,};)\s\w\w\)', r'\1)', processed)
    # keep only the major version: AppleWebKit/537.17 -> AppleWebKit/537
    processed = re.sub(r'(AppleWebKit|Safari)/?(\d+)(\.\d*\+?)+', r'\1/\2', processed)
    # Apple also has some bizarre versions for firmware, e.g. Mobile/10B329.
    # They are not needed for device/os/browser id, so the whole token is
    # reduced to "Mobile" (only when it is followed by a space).
    # It is questionable whether these should be maintained.
    processed = re.sub(r'Mobile/\w{2}\w+ ', r'Mobile ', processed)
    # remove apple versions 1_2_3 -> 1_2
    processed = re.sub(r'(\d+_\d+)(_\d+)+', r'\1', processed)
    # ignore everything after .NET,SV1,MSN,SIMBAR... in MS UA; the greedy .*
    # swallows the rest of the line, so the replacement re-adds the ")"
    processed = re.sub(r'\s(\.NET|BTR|GTB|SV1|MSN|SIMBAR|SLCC|MR(A|S)).*\)?', r')', processed)
    # remove version of gecko, hardly relevant
    processed = re.sub(r'rv:\d+\.\d+(\.\d+)?', r'', processed)
    # remove Build/JZO54K Build/1.A_3 from android
    processed = re.sub(r'\sBuild/[\w.-]+(\s|;)?', r'', processed)
    # round dates to year: 20100101 becomes 2010 (only when followed by whitespace)
    processed = re.sub(r'(\d\d\d\d)\d\d\d\d\s', r'\1 ', processed)
    # remove stuff like (65FAA2DA-9457-4993-B310-98228E704BBE)
    processed = re.sub(r'\((?:\w-?)+\)', r'', processed)
    return processed
# to compare unique datasets before and after the algorithm
# sort | uniq -c
def main():
    """Print the sanitized user agent of every record in the data set.

    Reads ``./some-data-set.txt`` line by line, splits each line on
    whitespace, rebuilds the user agent from the fields at index 2 onward
    and prints the result of ``sanitize`` for it. Lines with fewer than two
    fields (e.g. blank lines) are skipped.
    """
    # NOTE(review): the original comment described records as IP<space>UA,
    # yet the code read the IP from field 1 and the UA from field 2 onward,
    # so field 0 is presumably an extra leading column (such as the count
    # emitted by `uniq -c`) -- confirm against the actual data set.
    # The `with` block closes the file even if processing a line fails.
    with open('./some-data-set.txt') as f:
        for line in f:
            # ua chunks
            items = line.split()
            if len(items) < 2:
                # blank or malformed line: nothing to sanitize
                continue
            ua = " ".join(items[2:])
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3
            print(sanitize(ua))
# Run the batch job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BEFORE: | |
Mozilla/5.0 (Windows NT 6.2; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0 | |
Mozilla/5.0 (iPad; CPU OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) CriOS/31.0.1650.18 Mobile/11B554a Safari/8536.25 (B50A60CF-27BD-421B-A276-7688019CB295) | |
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) | |
UCWEB/2.0 (Linux; U; Adr 2.3;; Micromax A26) U2/1.0 UCBrowser/9.2 U2/1.0 Mobile | |
AFTER: | |
Mozilla/5.0 (Windows NT 6.2; WOW64; ) Gecko/2010 Firefox/26.0 | |
Mozilla/5.0 (iPad; CPU OS 7_0 like Mac OS X) AppleWebKit/537 (KHTML, like Gecko) CriOS/31.0 Mobile Safari/8536 | |
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0;) | |
UCWEB/2.0 (Linux; U; Adr 2.3;; Micromax A26) U2/1.0 UCBrowser/9.2 U2/1.0 Mobile |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment