Skip to content

Instantly share code, notes, and snippets.

@kalloc
Created April 18, 2023 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kalloc/e13226001c58e5fe43f371d0c98f4d4b to your computer and use it in GitHub Desktop.
Save kalloc/e13226001c58e5fe43f371d0c98f4d4b to your computer and use it in GitHub Desktop.
import os
import sys
import pymysql
import pymysql.cursors
import requests
from lxml import html
# Database settings are read from the environment so that credentials are
# not stored in the source file.
DB_PASS = os.environ.get('DB_PASS')
DB_NAME = os.environ.get('DB_NAME')
DB_HOST = os.environ.get('DB_HOST')
DB_USER = os.environ.get('DB_USER')

# Abort before touching the database when any setting is missing or empty.
if not all((DB_PASS, DB_NAME, DB_HOST, DB_USER)):
    print('Please set DB_PASS, DB_NAME, DB_HOST, DB_USER')
    # sys.exit instead of the bare `exit`: the latter is a helper installed
    # by the `site` module for interactive sessions and is not guaranteed
    # to exist when a script runs (e.g. `python -S`).
    sys.exit(1)
# Open the MySQL connection shared by the rest of the script.  DictCursor is
# requested so fetched rows can be addressed by column name.
connection = pymysql.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    database=DB_NAME,
    cursorclass=pymysql.cursors.DictCursor,
)
def getActualPdfLink(id, timeout=30):
    """Fetch the current PDF link of a record from its delta.rsl.ru MARC page.

    Downloads ``http://delta.rsl.ru/info/show/mrc/rsl01/{id}``, finds the
    ``span`` with class ``fieldName`` whose text is ``856`` and collects the
    text of the ``span`` elements with class ``data`` that follow it as
    siblings.  The second collected text node is returned; the caller treats
    it as the PDF link.

    Args:
        id: Record identifier interpolated into the URL.  The name shadows
            the builtin ``id`` but is kept so existing callers keep working.
        timeout: Seconds to wait for the HTTP server before giving up.

    Returns:
        The second text node found after the ``856`` field.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If fewer than two data values follow the ``856`` field.
    """
    url = f'http://delta.rsl.ru/info/show/mrc/rsl01/{id}'
    # Without a timeout a stalled server would block the whole batch forever.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    data_nodes = tree.xpath("//span[@class='fieldName'][text()='856']/following-sibling::span[@class='data']/text()")
    if len(data_nodes) < 2:
        # Same exception type as plain indexing would raise, but with enough
        # context to identify the offending record.
        raise IndexError(
            f'expected at least two data values after field 856 for record '
            f'{id} ({url}), found {len(data_nodes)}'
        )
    return data_nodes[1]
def do_update_link(connection, result, actualPdfLink):
    """Store a new pdfLink for a card and flag the card for re-indexing.

    Runs an UPDATE on ``tbl_common_biblio_card`` followed by an upsert into
    ``tbl_indexed`` that sets ``is_indexed`` to false.  Nothing is committed
    here; committing is left to the caller.

    Args:
        connection: Open database connection; the statements run on a cursor
            obtained from it.
        result: Row mapping providing the ``id``, ``FullSymbolicId`` and
            ``ALIS`` keys.
        actualPdfLink: Value to write into the ``pdfLink`` column.
    """
    # NOTE: idFromALIS is not a parameter and is not part of `result`; it is
    # read from the module-level loop variable of the driver script.
    print('Updating pdfLink', idFromALIS)
    update_sql = "UPDATE `tbl_common_biblio_card` SET `pdfLink`=%s WHERE `id`=%s"
    reindex_sql = (
        "INSERT INTO `tbl_indexed` (`FullSymbolicId`, `ALIS`, `is_indexed`) VALUES (%s, %s, false)"
        " ON DUPLICATE KEY UPDATE `is_indexed`=false"
    )
    # Use a cursor from the connection that was passed in rather than relying
    # on a global `cursor` that happens to be open in the calling script.
    with connection.cursor() as cursor:
        cursor.execute(update_sql, (actualPdfLink, result['id']))
        cursor.execute(reindex_sql, (result['FullSymbolicId'], result['ALIS']))
def do_force_reindex(connection, result):
    """Flag a card for re-indexing without changing its pdfLink.

    Upserts a row into ``tbl_indexed`` with ``is_indexed`` set to false.
    Nothing is committed here; committing is left to the caller.

    Args:
        connection: Open database connection; the statement runs on a cursor
            obtained from it.
        result: Row mapping providing the ``FullSymbolicId`` and ``ALIS``
            keys.
    """
    # NOTE: idFromALIS is not a parameter and is not part of `result`; it is
    # read from the module-level loop variable of the driver script.
    print('pdfLink is actual, request reindex', idFromALIS)
    reindex_sql = (
        "INSERT INTO `tbl_indexed` (`FullSymbolicId`, `ALIS`, `is_indexed`) VALUES (%s, %s, false)"
        " ON DUPLICATE KEY UPDATE `is_indexed`=false"
    )
    # Use a cursor from the connection that was passed in rather than relying
    # on a global `cursor` that happens to be open in the calling script.
    with connection.cursor() as cursor:
        cursor.execute(reindex_sql, (result['FullSymbolicId'], result['ALIS']))
def get_doc_info(connection, idFromALIS):
    """Look up a bibliographic card by its ``idFromALIS`` value.

    Args:
        connection: Open database connection; the query runs on a cursor
            obtained from it.
        idFromALIS: Value matched against the ``idFromALIS`` column of
            ``tbl_common_biblio_card``.

    Returns:
        The first matching row with the ``id``, ``FullSymbolicId``, ``ALIS``
        and ``pdfLink`` columns, or whatever the cursor's ``fetchone()``
        yields when there is no match (the caller tests it for falsiness).
    """
    sql = "SELECT `id`, `FullSymbolicId`, `ALIS`, `pdfLink` FROM `tbl_common_biblio_card` WHERE `idFromALIS`=%s"
    # Use a cursor from the connection that was passed in rather than relying
    # on a global `cursor` that happens to be open in the calling script.
    with connection.cursor() as cursor:
        cursor.execute(sql, (idFromALIS,))
        return cursor.fetchone()
# Driver: for every id listed in the input file, compare the stored pdfLink
# with the one currently published and either update it or just request a
# re-index.
# NOTE(review): `with connection:` is expected to close the connection on
# exit (PyMySQL 1.0+ behaviour) - confirm for the installed version.
with connection:
    # The path of the id list is the first command-line argument.
    if len(sys.argv) < 2:
        print('Usage: python3 index.py to_be_indexed.txt')
        sys.exit(1)
    filename = sys.argv[1]
    # Open the file named on the command line; previously the argument was
    # read but a hard-coded 'to_be_indexed.txt' was opened instead.
    with open(filename, 'r') as file:
        # `idFromALIS` and `cursor` are deliberately kept as module-level
        # names: helper functions in this script may read them as globals.
        for idFromALIS in file:
            idFromALIS = idFromALIS.strip()
            if not idFromALIS:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            with connection.cursor() as cursor:
                result = get_doc_info(connection, idFromALIS)
                if not result:
                    print('No such idFromALIS', idFromALIS)
                    continue
                actualPdfLink = getActualPdfLink(idFromALIS)
                if result['pdfLink'] != actualPdfLink:
                    do_update_link(connection, result, actualPdfLink)
                else:
                    do_force_reindex(connection, result)
            # Commit after each record so ids that were already processed
            # stay persisted if a later one fails.
            connection.commit()
    print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment