Skip to content

Instantly share code, notes, and snippets.

View thisismattmiller's full-sized avatar
😑
...

Matt Miller thisismattmiller

😑
...
View GitHub Profile
<!DOCTYPE html>
<html lang="en">
<head>
<!--
This is an HTML comment
You can write text in a comment and the content won't be visible in the page
-->
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:bf="http://id.loc.gov/ontologies/bibframe/" xmlns:bflc="http://id.loc.gov/ontologies/bflc/" xmlns:lclocal="http://id.loc.gov/ontologies/lclocal/" xmlns:madsrdf="http://www.loc.gov/mads/rdf/v1#" xmlns:pmo="http://performedmusicontology.org/ontology/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:streams="info:lc/streams#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<bf:Instance rdf:about="http://id.loc.gov/resources/instances/20898769">
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Print"/>
<bf:issuance>
<bf:Issuance rdf:about="http://id.loc.gov/vocabulary/issuance/mono"/>
</bf:issuance>
<bf:provisionActivity>
<bf:ProvisionActivity>
<rdf:type rdf:resource="http://id.loc.gov/ontologies/bibframe/Publication"/>
import requests
import json
source_data = json.load(open('data.json'))
url = 'https://maps.googleapis.com/maps/api/geocode/json'
import requests
import shutil
import camelot.io as camelot
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
for year in range(2011,2017):
url = f'https://files.dep.state.pa.us/Waste/Recycling/RecyclingPortalFiles/Documents/{year}_Recycling_Report.pdf'
@thisismattmiller
thisismattmiller / extract.py
Created March 30, 2022 18:31
Code for https://youtu.be/pwnIcJ9p2C4 Web scraping with selenium
import glob
from bs4 import BeautifulSoup
import json
all_files = list(glob.glob('html/*.html'))
all_data = []
for file_name in all_files:
with open(file_name) as infile:
import glob
import json
urls = {}
for file in glob.glob('data_sogb/*'):
with open(file) as inf:
import waybackpy
urls = [
"http://dmc.signourguestbook.com/?username=dmc&trail=25",
"http://dmc.signourguestbook.com/?username=dmc&trail=50",
"http://dmc.signourguestbook.com/?username=dmc&trail=75",
"http://dmc.signourguestbook.com/?username=dmc&trail=100",
"http://dmc.signourguestbook.com/?username=dmc&trail=125",
"http://dmc.signourguestbook.com/?username=dmc&trail=150",
"http://dmc.signourguestbook.com/?username=dmc&trail=175",
{
"id": "lc:RT:bf2:MIBluRayDVD:Instance",
"propertyTemplates": [
{
"mandatory": "false",
"propertyLabel": "Instance Of",
"propertyURI": "http://id.loc.gov/ontologies/bibframe/instanceOf",
"repeatable": "false",
"resourceTemplates": [],
"type": "resource",
{
"russcarnahan.com": 16,
"secure.actblue.com": 13,
"secure.piryx.com": 6,
"services.myngp.com": 5,
"rickperry.org": 5,
"secure.mydccc.org": 5,
"markleyva.com": 4,
"clyburnforcongress.com": 4,
"johnsprattforcongress.com": 4,
<div><strong>imdb_id1</strong>: <a href="/movie/<%=imdb_id1%>"><%=imdb_id1%></a></div>
<div><strong>color1</strong>: <%=color1%></div>
<div><strong>director_name1</strong>: <%=director_name1%></div>
<div><strong>num_critic_for_reviews1</strong>: <%=num_critic_for_reviews1%></div>
<div><strong>duration1</strong>: <%=duration1%></div>
<div><strong>director_facebook_likes1</strong>: <%=director_facebook_likes1%></div>
<div><strong>actor_3_facebook_likes1</strong>: <%=actor_3_facebook_likes1%></div>
<div><strong>actor_2_name1</strong>: <%=actor_2_name1%></div>
<div><strong>actor_1_facebook_likes1</strong>: <%=actor_1_facebook_likes1%></div>
<div><strong>gross1</strong>: <%=gross1%></div>