Skip to content

Instantly share code, notes, and snippets.

@AdroitAnandAI
Created January 9, 2019 10:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AdroitAnandAI/20e489bc2a21b50b82b006ee7875bc41 to your computer and use it in GitHub Desktop.
Save AdroitAnandAI/20e489bc2a21b50b82b006ee7875bc41 to your computer and use it in GitHub Desktop.
HTML Parser
# Fn to parse one html file and write the cleaned chat to "<filename>.txt"
def parse_html(filename):
    """Convert one HTML chat export into a plain-text conversation file.

    The text of every ``<div>`` whose class is ``from_name`` or ``text`` is
    collected in document order and cleaned entry by entry:

    1. newlines are removed and surrounding whitespace is stripped;
    2. entries containing ``https`` are dropped;
    3. the entry is lower-cased;
    4. entries rejected by ``ifPermissible`` are dropped (per the original
       comment, this is meant to weed out characters such as emojis).

    An entry that contains ``persona_1`` or ``persona_2`` starts a new output
    line, with the names replaced by ``"- - "`` and ``" - "`` respectively.
    Every other entry is appended to the current line followed by a space.
    Date and time stamps are then removed and the result is written to
    ``filename + ".txt"``.

    Args:
        filename: Path of the HTML file to read (opened as UTF-8).

    Returns:
        None. The only result is the ``filename + ".txt"`` file on disk.
    """
    with open(filename, encoding='utf8') as infile:
        soup = BeautifulSoup(infile, "html.parser")
        contents = [
            div.text
            for div in soup.find_all('div', {'class': ['from_name', 'text']})
        ]

    # Single cleaning pass; the order of the steps matters (the 'https' test
    # is deliberately done before lower-casing, as in the original).
    clean_chat = []
    for content in contents:
        chat = content.replace("\n", "").strip()
        if 'https' in chat:
            continue
        chat = chat.lower()
        if ifPermissible(chat):
            clean_chat.append(chat)

    # To convert the conversation into the model-readable format.
    # Pieces are appended in document order, so the messages that follow the
    # last sender header are kept (they used to be buffered and then lost).
    # NOTE(review): a message body that merely mentions a persona name is also
    # treated as a header here -- confirm whether that is acceptable.
    yml_chat = []
    for chat_line in clean_chat:
        if persona_1 in chat_line or persona_2 in chat_line:
            chat_line = chat_line.replace(persona_1, "- - ")
            chat_line = chat_line.replace(persona_2, " - ")
            yml_chat.append('\n' + chat_line)
        else:
            yml_chat.append(chat_line.strip() + " ")
    yml = ''.join(yml_chat)

    # To strip off date and time from forwarded messages.
    # Raw string with escaped dots: only "d.d.d" dates and "d:d:d" times are
    # removed, not arbitrary text such as "1 2 3".
    yml = re.sub(r'\d+\.\d+\.\d+|\d+:\d+:\d+', '', yml)

    with open(filename + ".txt", "w", encoding='utf8') as output:
        output.write(yml)
# Iterate the folder to process all files inside.
for entry in os.listdir(DATA_DIR_PATH):
    # parse_html writes "<input>.txt" next to its input; skip those generated
    # files so that a second run does not re-parse its own output.
    if entry.endswith(".txt"):
        continue
    filepath = os.path.join(DATA_DIR_PATH, entry)
    if os.path.isfile(filepath):
        parse_html(filepath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment