Created
January 9, 2019 10:29
-
-
Save AdroitAnandAI/20e489bc2a21b50b82b006ee7875bc41 to your computer and use it in GitHub Desktop.
HTML Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse_html(filename):
    """Parse one chat HTML file and write it out as a YAML-style text file.

    The text of every ``div`` whose class is ``from_name`` or ``text`` is
    collected, cleaned, and regrouped into conversation lines. The result is
    written to ``filename + ".txt"`` (UTF-8).

    Relies on names defined outside this function: ``BeautifulSoup``, ``re``,
    ``ifPermissible``, ``persona_1`` and ``persona_2``.

    Args:
        filename: Path of the HTML file to parse.
    """
    contents = []
    with open(filename, encoding='utf8') as infile:
        soup = BeautifulSoup(infile, "html.parser")
        for div in soup.find_all('div', {'class': ['from_name', 'text']}):
            contents.append(div.text)

    # Clean each extracted line: drop newlines and surrounding whitespace,
    # skip anything containing a link, lower-case the rest, and keep only
    # lines accepted by ifPermissible (e.g. to strip off emojis).
    clean_chat = []
    for content in contents:
        chat = content.replace("\n", "").strip()
        if 'https' in chat:
            continue
        chat = chat.lower()
        if ifPermissible(chat):
            clean_chat.append(chat)

    # Convert the conversation into the model-readable format: a line that
    # contains a persona name starts a new entry; every other line is
    # message text that is buffered until the next persona line appears.
    # NOTE(review): lines are lower-cased above, so persona_1 / persona_2
    # presumably hold lower-case names -- confirm where they are defined.
    yml_chat = []
    persona_dialogue = []
    for chat_line in clean_chat:
        if persona_1 in chat_line or persona_2 in chat_line:
            chat_line = chat_line.replace(persona_1, "- - ")
            chat_line = chat_line.replace(persona_2, " - ")
            yml_chat.extend(persona_dialogue)
            yml_chat.append('\n' + chat_line)
            persona_dialogue = []
        else:
            persona_dialogue.append(chat_line.strip() + " ")
    # Flush the text that follows the last persona line; without this the
    # tail of the conversation would be silently dropped.
    yml_chat.extend(persona_dialogue)

    yml = ''.join(yml_chat)

    # Strip dates (digit groups separated by '.', '/' or '-') and h:m:s
    # times, as found on forwarded messages. The separators are matched
    # literally so that ordinary long numbers are left untouched.
    yml = re.sub(r'\d+[./-]\d+[./-]\d+|\d+:\d+:\d+', '', yml)

    with open(filename + ".txt", "w", encoding='utf8') as output:
        output.write(yml)
# Process every regular file in the data folder, in sorted order.
# parse_html writes its result as "<input>.txt" into this same folder, so
# ".txt" files are skipped; otherwise a second run would feed the generated
# outputs back in as if they were HTML.
for file in sorted(os.listdir(DATA_DIR_PATH)):
    filepath = os.path.join(DATA_DIR_PATH, file)
    if os.path.isfile(filepath) and not file.endswith(".txt"):
        parse_html(filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment