Skip to content

Instantly share code, notes, and snippets.

View oliver-batey's full-sized avatar

Oliver Batey oliver-batey

  • Sunderland, United Kingdom
View GitHub Profile
@oliver-batey
oliver-batey / using_common_interface.py
Last active January 20, 2021 23:27
How to use the common interface to parse different file types
import parse_file as dp
# Sample documents used to exercise the parser, one per supported format
txt_path, docx_path, pdf_path = 'test_txt.txt', 'test_docx.docx', 'test_pdf.pdf'
html_path, pptx_path = 'test_html.html', 'test_pptx.pptx'

# Every sample path gathered into one list, in a fixed order
file_paths = [
    txt_path,
    docx_path,
    pdf_path,
    html_path,
    pptx_path,
]
@oliver-batey
oliver-batey / common_interface.py
Last active January 21, 2021 15:58
Common interface for parsing txt, docx, pdf, html and pptx
import os
import io
from docx import Document
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
@oliver-batey
oliver-batey / factory_pattern_part1.py
Last active January 20, 2021 20:27
Part 1 of the factory method pattern
import os
from docx import Document
class DocParser:
    """Single entry point that hands a document to the parser chosen for it."""

    def parse(self, document):
        """Parse *document* with the parser selected by ``get_format``.

        ``get_format(document)`` must return a callable; that callable is
        invoked with ``document`` and its result is returned unchanged.
        """
        return get_format(document)(document)
def get_format(document):
format = os.path.splitext(document)[-1]
@oliver-batey
oliver-batey / bad_file_parsing.py
Last active January 19, 2021 09:42
Example bad file parsing function
def serialise_file(document,format):
if format =='txt':
with open(document, 'r') as file:
string = file.read().replace('\n', ' ')
return string
elif format == 'docx'
#docx parsing code here
@oliver-batey
oliver-batey / sentence_sentiment.py
Last active November 28, 2021 19:59
Mean sentence sentiment
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from textblob import TextBlob
def sentiment_polarity(string: str) -> float:
    """Return the polarity component of TextBlob's sentiment for *string*.

    Polarity is the first field of the ``sentiment`` tuple that TextBlob
    computes for the text.
    """
    sentiment = TextBlob(string).sentiment
    return sentiment.polarity
@oliver-batey
oliver-batey / filter_language_structures.py
Last active November 28, 2021 18:29
Get relevant SVO patterns and sentences
import numpy as np
import pandas as pd
from textacy import extract, make_spacy_doc
# Read the whole article into memory, then strip out every newline
with open("news_article.txt", "r") as file:
    _raw = file.read()
data = _raw.replace("\n", "")
# Swap non-breaking spaces (U+00A0) for ordinary spaces
article = data.replace(u"\xa0", u" ")
# Create doc object
@oliver-batey
oliver-batey / subject_dependencies.py
Last active November 28, 2021 19:55
Calculate the distance between nodes of a dependency network
from itertools import count
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import textacy
# Load the article as one continuous string by cutting out every newline
with open("news_article.txt", "r") as file:
    data = "".join(file.read().split("\n"))
@oliver-batey
oliver-batey / plot_keyterms.py
Last active November 29, 2021 11:45
Plot keyterms of a document
from typing import List, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from textacy import extract, make_spacy_doc
def decompose_keyterms(keyterm_list: List[str]) -> Tuple:
@oliver-batey
oliver-batey / download_images.py
Created December 23, 2020 20:16
Download Instagram images
# Build a Profile for the 'natgeo' account, scrape it, then fetch its recent posts
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()
# Keep only the posts whose is_video flag is false, i.e. drop the videos
recent_photos = [post for post in recents if not post.is_video]
#Save photos in a loop
for i, post in enumerate(recent_photos):
from instascrape import Hashtag
# Substitute 'ad' with the word you
# want to search for (as a string)
hashtag = Hashtag('ad')
# Scrape the hashtag (must run before the posts are requested below)
hashtag.scrape()
# Get the list of recent posts for this hashtag
recents = hashtag.get_recent_posts()