Skip to content

Instantly share code, notes, and snippets.

@Thomas-Rosenkrans-Vestergaard
Last active February 13, 2020 20:23
Show Gist options
  • Save Thomas-Rosenkrans-Vestergaard/524dfc5d4d922226631ebe89943914c6 to your computer and use it in GitHub Desktop.
Save Thomas-Rosenkrans-Vestergaard/524dfc5d4d922226631ebe89943914c6 to your computer and use it in GitHub Desktop.
import re
import subprocess
# Captures (group 1) the JavaScript object literal assigned to `var profile`
# inside a <script> tag, up to the nearest closing </script>. The capture can
# end with a trailing ";" and whitespace, which the consumer has to strip.
#  * raw string: `\s` reaches the regex engine instead of being treated as an
#    (invalid) Python string escape;
#  * `(?s)`: `.` also matches newlines, whatever flags the caller passes;
#  * lazy `.+?`: stops at the first </script> and places no restriction on
#    which characters (e.g. Danish letters) the data may contain.
regex = r"""(?s)<script>var\s+profile\s*=\s*(\{.+?)(?=</script>)"""
# Sample input for the extraction below: an HTML fragment whose <script> tag
# assigns one large object literal to `var profile`. Most values are x/X
# placeholders. Note the trailing comma before the closing "]" of
# "popularSearches" near the end: the literal is valid JavaScript but is
# rejected by strict JSON parsers, and the statement ends with ";".
test_str = ("""
<script>var profile = {"env":null,
"id":"xxx","listingStatus":"xxx",
"price":{"value":"xxxx","text":"xxxx"},
"createdDate":{"value":"xxxx","text":"xxxx","ago":"xxxxx"},
"title":"Xxxxx",
"text":"Xxxxx",
"matrixData":[{"label":"Mærke ","value":"Xxxxx"},
{"label":"Model","value":"Xxxxx"},
{"label":"Xxxxx"},
{"label":"Stand","value":"Xxxxx"}],
"images":[{"small":"Xxxxx","medium":"Xxxxx","large":"Xxxxx"},
{"small":"Xxxxx","medium":"Xxxxx","large":"Xxxxx"},
{"small":"Xxxxx","medium":"Xxxxx","large":"Xxxxx"},
{"small":"Xxxxx","medium":"Xxxxx","large":"Xxxxx"},
{"small":"Xxxxx","medium":"Xxxxx","large":"Xxxxx"}],
"breadcrumb":{"level1":{"name":"Xxxxx",
"id":"Xxxxx",
"url":"Xxxxx"},
"level2":{"name":"Xxxxx",
"id":"Xxxxx",
"url":"XXXX/"},
"level3":{"name":"XXXX",
"id":"XXXX",
"url":"XXXX"}
},
"headMeta":{"title":"XXXX",
"description":"XXXX",
"url":"XXXX",
"price":"XXXX",
"image":"XXXX"
},
"profile":{"type":"XXXX",
"typeText":"XXXX",
"ownerId":"XXXX",
"name":"XXXX",
"address":{"address":"XXXX",
"zip":"XXXX",
"city":"XXXX",
"country":"XXXX",
"geoLocation":{"latitude":0,"longitude":0}
},
"nemIdValidated":null,
"website":null,
"registrationDate":{"value":"XXXX",
"text":"XXXX",
"ago":"XXXX"},
"profilePicture":{"url":"XXXX",
"name":"XXXX"},
"cvr":null,
"usPs":null,
"openingHours":null,
"soiLink":"XXXX",
"contactOptions":[{"type":"XXXX",
"data":[],
"title":"XXXX"}],
"mapLink":null,
"isFollowedByUser":false},
"links":[{"caption":"XXXX",
"rel":"XXXX",
"verb":"XXXX",
"href":"XXXX",
"gaAction":"XXXX"}],
"dataLayer":{"p":{"t":"XXXX",
"pl":"XXXX",
"v":"XXXX"},
"a":{"id":0,
"t":"XXXX",
"cdt":0,
"age":0,
"lpdt":0,
"dl":0,
"ic":0,
"prc":{"amt":0,
"cur":"xxx"},
"ftr":["QuestionsAndAnswers",
"VisibleContactInformation",
"Shipping"],
"u":{"li":false,
"huid":"XXXX",
"hue":"XXXX",
"at":"XXXX"},
"attr":{"Make":"XXXX",
"Condition":"XXXX",
"Model":"XXXX",
"Memory":"XXXX",
"SELLER_TYPE":"XXXX"},
"syiId":"XXXX"},
"c":{"l1":{"id":0,
"n":"XXXX"},
"l2":{"id":0,
"n":"XXXX"},
"l3":{"id":0,
"n":"XXXX"},
"c":{"id":0,"n":"XXXX"}},
"l":{"c":{"id":0,"n":"XXXX"},
"l1":{"id":0,"n":"XXXX"},
"l2":{"id":0,"n":"XXXX"},
"l3":{"id":0,"n":"XXXX"},
"l4":{"id":0,"n":"XXXX"},
"ltlng":"xxxx",
"pcid":"0"}},
"relatedListings":[{"links":[{"rel":"self",
"verb":"GET",
"href":"xxxx"},
{"rel":"image","verb":"GET","href":"xxxx"}],
"text":"xxxx",
"price":{"value":0,
"text":"2.139 kr."}},
{"links":[{"rel":"self","verb":"GET","href":"xxxx"},
{"rel":"image","verb":"GET","href":"xxxx"}],
"text":"xxxx",
"price":{"value":0,"text":"xxxx"}},
{"links":[{"rel":"self","verb":"GET","href":"xxxx"},
{"rel":"image","verb":"GET","href":"xxxx"}],
"text":"xxxx","price":{"value":0,
"text":"xxxx"}}],
"bilbasenRelatedListings":[],
"popularSearches":[{"query":"xxxx","url":"xxxx","label":"xxxx"},
{"query":"xxxx","url":"xxxx","label":"mobiltelefoner"},
{"query":"xxxx","url":"xxxx","label":"mobiltelefoner"},
{"query":"xxxx","url":"xxxx","label":"mobiltelefoner"},
{"query":"xxxx","url":"xxxx","label":"mobiltelefoner"},
],
"sellersOtherItems":null,
"adsense":{"format":"plas",
"pageLevelParameters":{"testgl":"da",
"textColorPalette":["#333333"],
"priceCurrency":"xxxx",
"hl":"xxxx",
"adsafe":"medium",
"pubId":"xxxx",
"query":"xxxx",
"channel":"xxxx", "target":"_blank","adLayout":"xxxx"}},
"advertising":null,"isFavorite":false};
</script>""")
def to_json(javascript):
    """Convert a JavaScript expression to JSON text by evaluating it in Node.js.

    Trailing spaces, semicolons and line breaks are stripped from
    ``javascript`` first. The remaining text is evaluated as a single
    JavaScript expression by a ``node`` child process and the result is
    serialised with ``JSON.stringify``.

    SECURITY NOTE: the input is *executed* as JavaScript. It runs inside a
    ``vm`` context, which keeps ``require``/``process`` out of direct reach,
    but Node's ``vm`` module is explicitly not a security boundary. Only feed
    this function text from sources you trust.

    Args:
        javascript: Source text of a JavaScript expression, typically an
            object literal.

    Returns:
        A ``(ok, text)`` tuple. ``(True, json_text)`` when evaluation
        succeeded; ``json_text`` ends with a newline. ``(False, message)``
        when the input was blank or Node rejected it; ``message`` is Node's
        error output.

    Raises:
        OSError: If the ``node`` executable cannot be started (for example
            ``FileNotFoundError`` when Node.js is not installed).
    """
    javascript = javascript.rstrip(' ;\n\r')
    if not javascript.strip():
        # Nothing to evaluate. Without this guard a blank input would be
        # reported as a success carrying the text "undefined".
        return (False, 'empty JavaScript expression')

    # The expression is piped through stdin instead of being spliced into the
    # command line: a single argv entry has a platform-dependent length limit
    # that large embedded objects can exceed. The "\n" before the closing
    # parenthesis keeps a trailing "//" comment from swallowing it.
    node_script = (
        "let src = '';"
        "process.stdin.setEncoding('utf8');"
        "process.stdin.on('data', (chunk) => { src += chunk; });"
        "process.stdin.on('end', () => {"
        "  const value = require('vm').runInNewContext('(' + src + '\\n)');"
        "  process.stdout.write(JSON.stringify(value) + '\\n');"
        "});"
    )
    # stdout and stderr are captured separately so that warnings printed by
    # Node cannot end up inside the returned JSON. UTF-8 is forced because the
    # Node script reads and writes UTF-8 regardless of the local code page.
    result = subprocess.run(
        ['node', '-e', node_script],
        input=javascript,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding='utf-8',
    )
    if result.returncode != 0:
        # This was not valid JavaScript.
        return (False, result.stderr or result.stdout)
    # This was valid JavaScript.
    return (True, result.stdout)
# Scan the sample page and convert every captured group to JSON, printing
# only the conversions that succeeded; failed conversions are skipped.
for found in re.finditer(regex, test_str, re.MULTILINE | re.DOTALL):
    for captured in found.groups():
        succeeded, payload = to_json(captured)
        if succeeded:
            print(payload)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment