This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import parse_file as dp | |
# Define paths to the test files, one per document format under test.
txt_path = 'test_txt.txt'
docx_path = 'test_docx.docx'
pdf_path = 'test_pdf.pdf'
html_path = 'test_html.html'
pptx_path = 'test_pptx.pptx'

# All test paths collected in one list, in the order defined above.
file_paths = [txt_path, docx_path, pdf_path, html_path, pptx_path]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
from docx import Document | |
from pdfminer3.layout import LAParams, LTTextBox | |
from pdfminer3.pdfpage import PDFPage | |
from pdfminer3.pdfinterp import PDFResourceManager | |
from pdfminer3.pdfinterp import PDFPageInterpreter | |
from pdfminer3.converter import PDFPageAggregator | |
from pdfminer3.converter import TextConverter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from docx import Document | |
class DocParser:
    """Dispatch a document to the parser selected for its file format."""

    def parse(self, document):
        """Parse ``document`` with the callable chosen by ``get_format``.

        Args:
            document: Path of the document. It is passed both to
                ``get_format`` and to the callable that ``get_format``
                returns.

        Returns:
            Whatever the selected callable returns for ``document``.
        """
        # get_format is the module-level function defined after this class;
        # it is expected to return a callable that accepts the document.
        parser = get_format(document)
        return parser(document)
def get_format(document): | |
format = os.path.splitext(document)[-1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def serialise_file(document,format): | |
if format =='txt': | |
with open(document, 'r') as file: | |
string = file.read().replace('\n', ' ') | |
return string | |
elif format == 'docx' | |
#docx parsing code here | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
from textblob import TextBlob | |
def sentiment_polarity(string: str) -> float:
    """Return the TextBlob sentiment polarity score of ``string``.

    Args:
        string: Text to analyse.

    Returns:
        The polarity component of ``TextBlob(string).sentiment``.
    """
    # ``sentiment`` is a (polarity, subjectivity) namedtuple; the named field
    # is the same value as index 0 but states the intent explicitly.
    return TextBlob(string).sentiment.polarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from textacy import extract, make_spacy_doc | |
# Load the entire article text
# NOTE(review): newlines are removed without inserting a space, so the last
# word of one line is joined to the first word of the next - confirm intended.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")

# Replace non-breaking spaces (U+00A0) with ordinary spaces.
article = data.replace(u"\xa0", u" ")
# Create doc object |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import count | |
import matplotlib.pyplot as plt | |
import networkx as nx | |
import numpy as np | |
import pandas as pd | |
import textacy | |
# Read the whole article into one string with the newline characters removed.
# NOTE(review): newlines are removed without inserting a space, so the last
# word of one line is joined to the first word of the next - confirm intended.
with open("news_article.txt", "r") as file:
    data = file.read().replace("\n", "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List, Tuple | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
from matplotlib.axes import Axes | |
from textacy import extract, make_spacy_doc | |
def decompose_keyterms(keyterm_list: List[str]) -> Tuple: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape profile and get recent posts
# NOTE(review): Profile is not imported in the visible part of this snippet;
# presumably `from instascrape import Profile` is needed - confirm.
natgeo = Profile('natgeo')
natgeo.scrape()
recents = natgeo.get_recent_posts()

# Filter list to separate images from videos (keep only non-video posts)
recent_photos = [post for post in recents if not post.is_video]
#Save photos in a loop | |
for i, post in enumerate(recent_photos): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from instascrape import Hashtag | |
# Substitute 'ad' with the word you want to search for (as a string)
hashtag = Hashtag('ad')

# Scrape the hashtag
hashtag.scrape()

# Get list of the recent posts
recents = hashtag.get_recent_posts()
Newer Older