Skip to content

Instantly share code, notes, and snippets.

@mrshu
mrshu / README.md
Last active January 21, 2024 06:07
Check non-200 PDF links

Non-200 PDF links checker

Find all .pdf links on a page and check which ones return a non-200 response.

Install

$ poetry install

Run

# Write a colour-coded diff of each original/edited pair into zero-based
# column 2 of the worksheet.
# Rows are numbered by position, starting at 1, rather than by index label, so
# a filtered or re-indexed DataFrame still lands on consecutive rows
# (row 0 is presumably the header row written by `DataFrame.to_excel`).
question_pairs = zip(df['original'], df['edited'])
for row_number, (original, edited) in enumerate(question_pairs, start=1):
    worksheet.write_rich_string(row_number, 2, *visualize_diff(original, edited))
# `ExcelWriter.save()` was deprecated in pandas 1.5 and removed in 2.0;
# `close()` saves the workbook and releases the file handle.
excel_writer.close()
import difflib
# Build a list of fragments describing how string `a` differs from string `b`,
# driven by the opcodes reported by difflib.SequenceMatcher.
def visualize_diff(a, b):
seqm = difflib.SequenceMatcher(None, a, b)
# Fragments are collected here in opcode order.
output= []
# Each opcode refers to the slice a[a0:a1] and the slice b[b0:b1].
for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
if opcode == 'equal':
# Unchanged text is appended as a plain string.
output.append(seqm.a[a0:a1])
elif opcode == 'insert':
# `green` is not defined inside this function; it is read from the
# enclosing (module) scope.
output.append(green)
# NOTE(review): the visible excerpt stops here -- no inserted text is appended
# after `green`, no other opcodes are handled, and nothing is returned in the
# lines shown. Confirm against the full gist.
# Dump the DataFrame into an xlsxwriter-backed workbook and keep handles to
# the workbook/worksheet so formatted cells can be added afterwards.
sheet_name = 'to_review'
excel_writer = pd.ExcelWriter(
    "edited_questions.xlsx",
    engine='xlsxwriter',
)
df.to_excel(
    excel_writer,
    index=False,
    sheet_name=sheet_name,
)
workbook = excel_writer.book
worksheet = excel_writer.sheets[sheet_name]

# Font formats used to colour fragments of a rich string.
green, red = (
    workbook.add_format({'color': shade}) for shade in ('green', 'red')
)
b_green = workbook.add_format({'color': 'green', 'bold': True})
@mrshu
mrshu / df_sample_questions.py
Last active December 23, 2020 18:51
A sample DataFrame with original and edited questions
import pandas as pd
# Sample data: each record pairs an 'original' question with its 'edited'
# version.
df = pd.DataFrame([
{
# The two strings differ only in "abt" vs "about".
'original': "Can you tell us a bit more abt how scalable your solution is?",
'edited': "Can you tell us a bit more about how scalable your solution is?",
},
{
# The two strings differ only in the final word ("quarter" vs "year").
'original': "What will our priorities be for the next quarter?",
'edited': "What will our priorities be for the next year?",
},
# NOTE(review): the visible excerpt ends here; the list literal and the
# DataFrame call are not closed in the lines shown.
@mrshu
mrshu / xlsxwriter_write_rich_string_sample.py
Last active December 23, 2020 18:20
An example of using the `write_rich_string` function
import xlsxwriter
# Create the workbook object for the file 'rich_strings.xlsx'.
workbook = xlsxwriter.Workbook('rich_strings.xlsx')
# Cell formats applied to individual fragments of the rich string.
bold = workbook.add_format({'bold': True})
italic = workbook.add_format({'italic': True})
# NOTE(review): `worksheet` is not defined anywhere in the visible excerpt;
# presumably it comes from `workbook.add_worksheet()` -- confirm.
worksheet.write_rich_string('A1',
# A plain fragment, then a format object followed by the text it applies to.
'This is ',
bold, 'bold',
# NOTE(review): the excerpt ends here with the call still open.
import logging
import os
import json
import urllib3
import datetime
# Required setting: read from the AIRFLOW_URL environment variable; a missing
# variable raises KeyError when this line runs.
# (Presumably the base URL of the Airflow instance -- confirm against usage.)
AIRFLOW_URL = os.environ['AIRFLOW_URL']
# Hard-coded DAG identifier.
DAG_ID = 'my_helpful_dag'
# Log level name taken from LOG_LEVEL and upper-cased; 'INFO' when unset.
LOG_LEVEL = os.environ.get('LOG_LEVEL', 'info').upper()
@mrshu
mrshu / install_pytorch_python27.py
Created August 7, 2019 14:49
Install PyTorch on AWS Glue
import os
import site
from setuptools.command import easy_install
# Target directory for the install, read from the GLUE_INSTALLATION
# environment variable (KeyError if it is not set).
install_path = os.environ['GLUE_INSTALLATION']
# Invoke easy_install programmatically to install `torch` into that directory.
easy_install.main( ["--install-dir", install_path, "torch"] )
# Re-run the `site` module; presumably this makes the newly installed package
# importable in the current process -- confirm. `reload` is used as a builtin
# here, which exists only in Python 2.
reload(site)
# The import is placed after the install step above, so it can only succeed
# once the package has been installed.
import torch
# Print the installed version as a quick check that the import worked.
print(torch.__version__)
#!/usr/bin/env python3
# Convert a CSV file to Parquet.
#
# Usage: <script> <input.csv> <output.parquet>
import pandas as pd
import sys


def main(argv=None):
    """Read a CSV file and write its contents as a Parquet file.

    Args:
        argv: Two command-line arguments, the input CSV path followed by the
            output Parquet path. Defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: With a usage message when the argument count is not
            exactly two (previously this surfaced as a bare ``IndexError``).
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        sys.exit(f"usage: {sys.argv[0]} <input.csv> <output.parquet>")
    source, target = args
    pd.read_csv(source).to_parquet(target)


# Guarded so that importing this module does not trigger a conversion.
if __name__ == "__main__":
    main()
from airflow.models import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
from datetime import datetime
class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    """Athena operator whose ``execute`` returns the query execution id.

    The parent operator's ``execute`` is run unchanged; the only addition is
    that ``self.query_execution_id`` is returned afterwards.
    """

    def execute(self, context):
        """Run the parent ``execute`` and return ``self.query_execution_id``.

        Args:
            context: The context object, passed through unchanged to the
                parent operator's ``execute``.

        Returns:
            The value of ``self.query_execution_id`` after the parent
            ``execute`` has finished.
        """
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        # just so that this gets `xcom_push`(ed)
        return self.query_execution_id