Last active
June 24, 2023 09:05
-
-
Save ckhung/e2f5e1a13f27a2b4b0fa66c3c3fad471 to your computer and use it in GitHub Desktop.
line inspector w/ selenium
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: start a python3 interpreter, then run:
#     exec(open('linespector.py').read())
# Then try, line by line, the commented-out code at the end of this file.
# Wonderful reference:
# https://cosmocode.io/how-to-connect-selenium-to-an-existing-browser-that-was-opened-manually/
# (found from here: https://stackoverflow.com/a/70088095)
import base64
import copy
import os
import re
from datetime import datetime

import magic
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def init(debugger_address="127.0.0.1:9222", chrome_driver="/usr/bin/chromedriver"):
    """Attach Selenium to an already-running Chrome and store it in ``driver``.

    The browser is reached through Chrome's ``debuggerAddress`` experimental
    option, so no new browser window is launched by this function.

    Args:
        debugger_address: ``host:port`` passed as the ``debuggerAddress``
            experimental option.
        chrome_driver: Filesystem path of the chromedriver executable.

    Side effects:
        Sets the module-level global ``driver``, which the other helpers in
        this file read.
    """
    global driver
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", debugger_address)
    try:
        # Current Selenium takes the chromedriver path through a Service
        # object; the positional path argument is no longer accepted.
        driver = webdriver.Chrome(service=Service(chrome_driver), options=chrome_options)
    except TypeError:
        # Older Selenium releases do not know the ``service`` keyword and
        # still expect the executable path as the first positional argument.
        driver = webdriver.Chrome(chrome_driver, options=chrome_options)
    # print(driver.title)


# mime = magic.Magic(mime=True)
def get_all_tabs():
    """Return a dict mapping each open window's title to its window handle.

    Uses the module-level ``driver``. Every handle in
    ``driver.window_handles`` is switched to in turn so its title can be
    read; the driver is left focused on the last handle visited. When two
    windows share a title, the later handle overwrites the earlier one.
    """

    def _title_after_switching_to(window_handle):
        # The title is only readable for the window that has focus.
        driver.switch_to.window(window_handle)
        return driver.title

    return {
        _title_after_switching_to(window_handle): window_handle
        for window_handle in driver.window_handles
    }
def _format_message(msg):
    """Return ``'HH:MM [user] text'`` for one chat message element."""
    # data-timestamp is integer-divided by 1000 before conversion
    # (presumably milliseconds since the epoch -- confirm against the page).
    sent_at = datetime.fromtimestamp(int(msg['data-timestamp']) // 1000)
    # Work on a copy so the caller's parse tree is not modified.
    body = copy.copy(msg)
    # Drop elements that should not appear in the printed message text.
    for tag_name, attrs in (
        ('button', {}),
        ('span', {'class': 'metaInfo-module__read_count__8-U6j'}),
        ('time', {}),
    ):
        unwanted = body.find(tag_name, attrs)
        if unwanted is not None:
            unwanted.extract()
    name_tag = body.find('pre', {'class': 'username-module__username__vGQGj'})
    if name_tag is not None:
        uname = name_tag.text
        name_tag.extract()
    else:
        # No username element found in this message.
        uname = '*'
    return '{} [{}] {}'.format(sent_at.strftime('%H:%M'), uname, body.text)


def print_all_msg(chat):
    """Print every timestamped entry of a chat, in reverse document order.

    Args:
        chat: Parsed element (e.g. a BeautifulSoup tag) whose descendant
            ``div`` elements carrying a ``data-timestamp`` attribute are the
            chat entries.

    Each entry is printed as one of:
      * ``### <data-message-content> ###`` for date-separator entries,
      * ``HH:MM [user] text`` for entries that have a
        ``data-message-content-prefix`` attribute,
      * the prettified markup of the entry itself when it is neither.
    """
    for msg in reversed(chat.find_all('div', {'data-timestamp': True})):
        # A div without a class attribute must not abort the whole listing.
        css_classes = msg.get('class') or []
        if 'messageDate-module__date_wrap__I4ily' in css_classes:
            print('### {} ###'.format(msg['data-message-content']))
        elif 'data-message-content-prefix' in msg.attrs:
            print(_format_message(msg))
        else:
            # Failed to recognize this entry: dump it for inspection.
            print(msg.prettify())
# https://stackoverflow.com/a/47425305
def get_file_content_chrome(driver, uri):
    """Fetch ``uri`` from inside the browser and return its content as bytes.

    A script is run in the page via ``driver.execute_async_script``. It
    issues a GET ``XMLHttpRequest`` with ``responseType = 'arraybuffer'``,
    base64-encodes the response in JavaScript and hands the resulting string
    back, where it is decoded to bytes.

    Args:
        driver: Object providing ``execute_async_script(script, *args)``.
        uri: Address to request from within the page.

    Returns:
        The base64-decoded response body as ``bytes``.

    Raises:
        Exception: When the script reports an integer (``xhr.status`` from
            the ``onerror`` handler) instead of a base64 string.
    """
    result = driver.execute_async_script("""
    var uri = arguments[0];
    var callback = arguments[1];
    var toBase64 = function(buffer){for(var r,n=new Uint8Array(buffer),t=n.length,a=new Uint8Array(4*Math.ceil(t/3)),i=new Uint8Array(64),o=0,c=0;64>c;++c)i[c]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".charCodeAt(c);for(c=0;t-t%3>c;c+=3,o+=4)r=n[c]<<16|n[c+1]<<8|n[c+2],a[o]=i[r>>18],a[o+1]=i[r>>12&63],a[o+2]=i[r>>6&63],a[o+3]=i[63&r];return t%3===1?(r=n[t-1],a[o]=i[r>>2],a[o+1]=i[r<<4&63],a[o+2]=61,a[o+3]=61):t%3===2&&(r=(n[t-2]<<8)+n[t-1],a[o]=i[r>>10],a[o+1]=i[r>>4&63],a[o+2]=i[r<<2&63],a[o+3]=61),new TextDecoder("ascii").decode(a)};
    var xhr = new XMLHttpRequest();
    xhr.responseType = 'arraybuffer';
    xhr.onload = function(){ callback(toBase64(xhr.response)) };
    xhr.onerror = function(){ callback(xhr.status) };
    xhr.open('GET', uri);
    xhr.send();
    """, uri)
    # The onerror handler reports a numeric status; success yields a string.
    if isinstance(result, int):
        raise Exception("Request failed with status %s" % result)
    return base64.b64decode(result)
def save_blob_as(blob, filepath):
    """Download ``blob`` through the browser and write it to ``filepath``.

    Args:
        blob: URI handed to ``get_file_content_chrome`` together with the
            module-level ``driver``.
        filepath: Destination file; opened in binary write mode.

    Raises:
        Exception: Propagated from ``get_file_content_chrome`` when the
            in-browser request fails.
    """
    # Fetch before opening the file so that a failed download does not leave
    # an empty (or truncated) file behind.
    blob_content = get_file_content_chrome(driver, blob)
    # print(magic.from_buffer(blob_content))
    # https://github.com/ahupp/python-magic
    with open(filepath, 'wb') as f:
        f.write(blob_content)
# Final path segment of an image URI: at least 30 word characters or hyphens.
_BLOB_NAME_RE = re.compile(r'/([\w-]{30,})$')


def save_all_blobs(chat, path):
    """Save every matching ``<img>`` found under ``chat`` into ``path``.

    Only images whose ``src`` ends in ``/`` followed by at least 30 word or
    hyphen characters are saved; that final segment becomes the file name.
    Files are always written as ``<path>/<segment>.jpg``, whatever the
    downloaded content actually is.

    Args:
        chat: Parsed element providing ``find_all('img')``.
        path: Target directory; created if it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)
    for img in chat.find_all('img'):
        # Not every <img> carries a src attribute; skip those that do not.
        blob = img.get('src')
        if not blob:
            continue
        m = _BLOB_NAME_RE.search(blob)
        if m is not None:
            save_blob_as(blob, '{}/{}.jpg'.format(path, m.group(1)))
# Example interactive session. It is kept inside a string literal so that it
# is not executed when this file is loaded; run it line by line by hand.
'''
init()
all_tabs = get_all_tabs()
line_handle = all_tabs['LINE']
driver.switch_to.window(line_handle)
# After every switch to a new chat:
page_soup = BeautifulSoup(driver.page_source, 'html.parser')
# with open('a.htm', 'w') as f: f.write(page_soup.prettify())
all_chats = page_soup.find_all('div', {'class': 'chatlistItem-module__chatlist_item__MOwxh'})
current_chat = page_soup.find_all('div', {'class': 'message_list'})
assert 1==len(current_chat)
print_all_msg(current_chat[0])
save_all_blobs(current_chat[0], '/tmp/linespector')
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment