Marek Šuppa mrshu

## README.md

      
              4 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                mrshu
                / README.md
            
            
              Last active
              January 21, 2024 06:07
            
              
                Check non-200 PDF links
              
          
    Non-200 PDF links checker

Find all `.pdf` links on a page and check which ones return a non-200 response.
Install

$ poetry install

Run


## xlsxwriter_write_diff.py
# Write the visualised diff of each original/edited pair into column index 2
# of the worksheet, one sheet row per DataFrame row.
# Rows are numbered by position (starting at 1; row 0 is presumably the header
# written by `to_excel` -- confirm) rather than by the DataFrame index label,
# so a non-default index still maps onto the right sheet row.
for sheet_row, (_, row) in enumerate(df.iterrows(), start=1):
    fragments = visualize_diff(row['original'], row['edited'])
    worksheet.write_rich_string(sheet_row, 2, *fragments)
# `ExcelWriter.save()` was deprecated in pandas 1.5 and removed in 2.0;
# `close()` writes the file and is available across versions.
excel_writer.close()

## visualize_diff.py
import difflib

def visualize_diff(a, b):
    """Collect fragments describing how sequence ``a`` differs from ``b``.

    Walks the opcodes of ``difflib.SequenceMatcher(None, a, b)`` and appends
    pieces to ``output``.

    NOTE(review): this snippet appears to be cut off -- only the 'equal'
    branch and the first statement of the 'insert' branch are visible, and
    no ``return`` is shown. Confirm against the complete file.
    """
    seqm = difflib.SequenceMatcher(None, a, b)
    output= []
    # Each opcode relates the slice a[a0:a1] to the slice b[b0:b1].
    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
        if opcode == 'equal':
            # Unchanged text is appended as-is.
            output.append(seqm.a[a0:a1])
        elif opcode == 'insert':
            # `green` is not defined in this function; it must come from the
            # enclosing module scope.
            output.append(green)

## xlsxwriter_formatting_preparation.py
# Dump the DataFrame into a sheet through the xlsxwriter engine, then keep
# handles to the workbook/worksheet so cells can be formatted afterwards.
sheet_name = 'to_review'
excel_writer = pd.ExcelWriter("edited_questions.xlsx", engine='xlsxwriter')
df.to_excel(excel_writer, index=False, sheet_name=sheet_name)

# Objects exposed by the writer for the sheet that was just written.
workbook, worksheet = excel_writer.book, excel_writer.sheets[sheet_name]

# Formats are registered in the order: green, red, bold green.
green, red, b_green = (
    workbook.add_format(properties)
    for properties in (
        {'color': 'green'},
        {'color': 'red'},
        {'color': 'green', 'bold': True},
    )
)

## df_sample_questions.py
import pandas as pd
df = pd.DataFrame([
    {
      'original': "Can you tell us a bit more abt how scalable your solution is?",
      'edited': "Can you tell us a bit more about how scalable your solution is?",
    },
    {
      'original': "What will our priorities be for the next quarter?",
      'edited': "What will our priorities be for the next year?",
    },

## xlsxwriter_write_rich_string_sample.py
import xlsxwriter

# Create the output workbook.
workbook   = xlsxwriter.Workbook('rich_strings.xlsx')
# `worksheet` is used by the `write_rich_string` call that follows but was
# never created in this snippet; add the sheet here so that call does not
# fail with a NameError.
worksheet = workbook.add_worksheet()

# Formats applied to individual fragments of the rich string.
bold   = workbook.add_format({'bold': True})
italic = workbook.add_format({'italic': True})

worksheet.write_rich_string('A1',
                            'This is ',
                            bold, 'bold',

## lambda_trigger_airflow_dag.py
import logging
import os
import json
import urllib3
import datetime

# DAG identifier (presumably the DAG this lambda triggers -- confirm).
DAG_ID = 'my_helpful_dag'
# Required setting: a missing AIRFLOW_URL variable raises KeyError on import.
AIRFLOW_URL = os.environ['AIRFLOW_URL']
# Optional setting: upper-cased level name, 'INFO' when LOG_LEVEL is unset.
LOG_LEVEL = os.getenv('LOG_LEVEL', 'info').upper()

## install_pytorch_python27.py
import os
import site
from setuptools.command import easy_install
# Target directory for the installation, read from the environment
# (raises KeyError when GLUE_INSTALLATION is not set).
install_path = os.environ['GLUE_INSTALLATION']
# Install the "torch" package into install_path via easy_install.
easy_install.main( ["--install-dir", install_path, "torch"] )
# NOTE(review): `reload` is a builtin only on Python 2. Presumably `site` is
# reloaded so the freshly installed package becomes importable -- confirm.
reload(site)


# Import the package that was just installed and print its version.
import torch
print(torch.__version__)

## csv_to_parquet.py
#!/usr/bin/env python3

import pandas as pd
import sys

# Load the CSV named by the first CLI argument, then write it out as Parquet
# to the path given by the second CLI argument.
frame = pd.read_csv(sys.argv[1])
frame.to_parquet(sys.argv[2])

## athena_run_query_move_results.py
from airflow.models import DAG
from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
from datetime import datetime

class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
    """Athena operator whose ``execute`` returns the query execution id."""

    def execute(self, context):
        """Run the parent ``execute`` and return ``self.query_execution_id``.

        The parent's own return value is discarded.
        """
        super(XComEnabledAWSAthenaOperator, self).execute(context)
        execution_id = self.query_execution_id
        # Returning the id is what gets it `xcom_push`(ed).
        return execution_id
	# Write the visualised diff of each original/edited pair into column index 2
	# of the worksheet, one sheet row per DataFrame row (loop-body indentation
	# restored).
	# Rows are numbered by position (starting at 1; row 0 is presumably the
	# header written by `to_excel` -- confirm) rather than by the DataFrame index
	# label, so a non-default index still maps onto the right sheet row.
	for sheet_row, (_, row) in enumerate(df.iterrows(), start=1):
	    fragments = visualize_diff(row['original'], row['edited'])
	    worksheet.write_rich_string(sheet_row, 2, *fragments)
	# `ExcelWriter.save()` was deprecated in pandas 1.5 and removed in 2.0;
	# `close()` writes the file and is available across versions.
	excel_writer.close()
	import difflib

	def visualize_diff(a, b):
	    """Collect fragments describing how sequence ``a`` differs from ``b``.

	    Walks the opcodes of ``difflib.SequenceMatcher(None, a, b)`` and appends
	    pieces to ``output``. The nesting below was restored from a flattened
	    copy of the code.

	    NOTE(review): this snippet appears to be cut off -- only the 'equal'
	    branch and the first statement of the 'insert' branch are visible, and
	    no ``return`` is shown. Confirm against the complete file.
	    """
	    seqm = difflib.SequenceMatcher(None, a, b)
	    output = []
	    # Each opcode relates the slice a[a0:a1] to the slice b[b0:b1].
	    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
	        if opcode == 'equal':
	            # Unchanged text is appended as-is.
	            output.append(seqm.a[a0:a1])
	        elif opcode == 'insert':
	            # `green` is not defined in this function; it must come from
	            # the enclosing module scope.
	            output.append(green)
	# Dump the DataFrame into a sheet through the xlsxwriter engine, then keep
	# handles to the workbook/worksheet so cells can be formatted afterwards.
	sheet_name = 'to_review'
	excel_writer = pd.ExcelWriter("edited_questions.xlsx", engine='xlsxwriter')
	df.to_excel(excel_writer, index=False, sheet_name=sheet_name)

	# Objects exposed by the writer for the sheet that was just written.
	workbook, worksheet = excel_writer.book, excel_writer.sheets[sheet_name]

	# Formats are registered in the order: green, red, bold green.
	green, red, b_green = (
	    workbook.add_format(properties)
	    for properties in (
	        {'color': 'green'},
	        {'color': 'red'},
	        {'color': 'green', 'bold': True},
	    )
	)
	import pandas as pd
	df = pd.DataFrame([
	{
	'original': "Can you tell us a bit more abt how scalable your solution is?",
	'edited': "Can you tell us a bit more about how scalable your solution is?",
	},
	{
	'original': "What will our priorities be for the next quarter?",
	'edited': "What will our priorities be for the next year?",
	},
	import xlsxwriter

	# Create the output workbook.
	workbook = xlsxwriter.Workbook('rich_strings.xlsx')
	# `worksheet` is used by the `write_rich_string` call that follows but was
	# never created in this snippet; add the sheet here so that call does not
	# fail with a NameError.
	worksheet = workbook.add_worksheet()

	# Formats applied to individual fragments of the rich string.
	bold = workbook.add_format({'bold': True})
	italic = workbook.add_format({'italic': True})

	worksheet.write_rich_string('A1',
	'This is ',
	bold, 'bold',
	import logging
	import os
	import json
	import urllib3
	import datetime

	# DAG identifier (presumably the DAG this lambda triggers -- confirm).
	DAG_ID = 'my_helpful_dag'
	# Required setting: a missing AIRFLOW_URL variable raises KeyError on import.
	AIRFLOW_URL = os.environ['AIRFLOW_URL']
	# Optional setting: upper-cased level name, 'INFO' when LOG_LEVEL is unset.
	LOG_LEVEL = os.getenv('LOG_LEVEL', 'info').upper()
	import os
	import site
	from setuptools.command import easy_install
	# Target directory for the installation, read from the environment
	# (raises KeyError when GLUE_INSTALLATION is not set).
	install_path = os.environ['GLUE_INSTALLATION']
	# Install the "torch" package into install_path via easy_install.
	easy_install.main( ["--install-dir", install_path, "torch"] )
	# NOTE(review): `reload` is a builtin only on Python 2. Presumably `site` is
	# reloaded so the freshly installed package becomes importable -- confirm.
	reload(site)


	# Import the package that was just installed and print its version.
	import torch
	print(torch.__version__)
	#!/usr/bin/env python3

	import pandas as pd
	import sys

	# Load the CSV named by the first CLI argument, then write it out as Parquet
	# to the path given by the second CLI argument.
	frame = pd.read_csv(sys.argv[1])
	frame.to_parquet(sys.argv[2])
	from airflow.models import DAG
	from airflow.contrib.operators.aws_athena_operator import AWSAthenaOperator
	from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
	from datetime import datetime

	class XComEnabledAWSAthenaOperator(AWSAthenaOperator):
	    """Athena operator whose ``execute`` returns the query execution id.

	    The class/method nesting was restored from a flattened copy of the code,
	    which was not valid Python as written.
	    """

	    def execute(self, context):
	        """Run the parent ``execute`` and return ``self.query_execution_id``.

	        The parent's own return value is discarded.
	        """
	        super(XComEnabledAWSAthenaOperator, self).execute(context)
	        # just so that this gets `xcom_push`(ed)
	        return self.query_execution_id