Find all .pdf links on a page and check which ones return a non-200 response
$ poetry install
# Write a rich-text diff of every (original, edited) pair into column 2
# (the third column). Rows are addressed by position rather than by index
# label, so a filtered or re-indexed DataFrame still lands on the right rows;
# numbering starts at 1, presumably because row 0 holds the header.
for row_number, (original, edited) in enumerate(
        zip(df['original'], df['edited']), start=1):
    worksheet.write_rich_string(
        row_number, 2, *visualize_diff(original, edited))

# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0;
# close() writes the workbook to disk on old and new versions alike.
excel_writer.close()
import difflib | |
def visualize_diff(a, b): | |
seqm = difflib.SequenceMatcher(None, a, b) | |
output= [] | |
for opcode, a0, a1, b0, b1 in seqm.get_opcodes(): | |
if opcode == 'equal': | |
output.append(seqm.a[a0:a1]) | |
elif opcode == 'insert': | |
output.append(green) |
# Worksheet that receives the DataFrame.
sheet_name = 'to_review'

# Send pandas' Excel output through XlsxWriter so the underlying workbook and
# worksheet objects can be used directly afterwards.
excel_writer = pd.ExcelWriter("edited_questions.xlsx", engine='xlsxwriter')
df.to_excel(excel_writer, index=False, sheet_name=sheet_name)

# Handles onto the XlsxWriter objects behind the pandas writer.
workbook = excel_writer.book
worksheet = excel_writer.sheets[sheet_name]

# Font formats registered on the workbook: plain green, plain red, bold green.
green = workbook.add_format(dict(color='green'))
red = workbook.add_format(dict(color='red'))
b_green = workbook.add_format(dict(color='green', bold=True))
import pandas as pd | |
df = pd.DataFrame([ | |
{ | |
'original': "Can you tell us a bit more abt how scalable your solution is?", | |
'edited': "Can you tell us a bit more about how scalable your solution is?", | |
}, | |
{ | |
'original': "What will our priorities be for the next quarter?", | |
'edited': "What will our priorities be for the next year?", | |
}, |
import xlsxwriter | |
workbook = xlsxwriter.Workbook('rich_strings.xlsx') | |
bold = workbook.add_format({'bold': True}) | |
italic = workbook.add_format({'italic': True}) | |
worksheet.write_rich_string('A1', | |
'This is ', | |
bold, 'bold', |
import logging | |
import os | |
import json | |
import urllib3 | |
import datetime | |
# Required setting: a missing AIRFLOW_URL variable raises KeyError at import.
AIRFLOW_URL = os.environ['AIRFLOW_URL']

# Hard-coded DAG identifier.
DAG_ID = 'my_helpful_dag'

# Optional setting: upper-cased level name, 'INFO' when LOG_LEVEL is unset.
LOG_LEVEL = os.getenv('LOG_LEVEL', 'info').upper()
import os
import site
from setuptools.command import easy_install

try:
    # Python 3: reload() is no longer a builtin; it lives in importlib.
    from importlib import reload
except ImportError:
    # Python 2: importlib has no reload(); the builtin is used instead.
    pass

# Target directory for the install; GLUE_INSTALLATION must be set, otherwise
# this raises KeyError.
install_path = os.environ['GLUE_INSTALLATION']

# NOTE(review): easy_install is deprecated in recent setuptools releases --
# confirm the runtime this script targets still ships it.
easy_install.main(["--install-dir", install_path, "torch"])

# Re-initialise `site`, presumably so the freshly installed package becomes
# importable in this same process.
reload(site)

import torch
print(torch.__version__)
#!/usr/bin/env python3
"""Convert a CSV file to Parquet.

Usage: <script> <input.csv> <output.parquet> [ignored extra arguments...]
"""
import sys

import pandas as pd


def main(argv):
    """Read the CSV at ``argv[1]`` and write it as Parquet to ``argv[2]``.

    Any arguments after the second path are ignored, as they were before.

    Args:
        argv: Full argument vector, program name first (``sys.argv`` shape).

    Returns:
        Process exit status: 0 on success, 2 when a path is missing.
    """
    if len(argv) < 3:
        program = argv[0] if argv else "csv_to_parquet"
        sys.stderr.write(f"usage: {program} <input.csv> <output.parquet>\n")
        return 2

    source_path, destination_path = argv[1], argv[2]
    pd.read_csv(source_path).to_parquet(destination_path)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
from airflow.models import DAG | |
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator | |
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator | |
from datetime import datetime | |
class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    """Athena operator whose ``execute`` returns the query execution id."""

    def execute(self, context):
        """Run the parent operator, then return ``self.query_execution_id``.

        The parent's own return value is discarded; the id is returned so
        that it gets pushed to XCom.
        """
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        execution_id = self.query_execution_id
        return execution_id