Skip to content

Instantly share code, notes, and snippets.

@gtrabanco
Last active May 23, 2023 00:54
Show Gist options
  • Save gtrabanco/9c89a8f7be106290575ff55a40e405e0 to your computer and use it in GitHub Desktop.
Save gtrabanco/9c89a8f7be106290575ff55a40e405e0 to your computer and use it in GitHub Desktop.
Idea to parse document
import { genericMapToType } from './generic-map-to-type.ts';
/**
 * Signature of a setter callback taking a property name and an optional value.
 * Not referenced by any other declaration in the visible code.
 */
export type ScraperSetter = (property: string, value?: any) => void;
// Key and value types used by the storage maps below.
// NOTE(review): `Exclude<any, Function>` appears to resolve to plain `any`, so
// these aliases likely do not actually rule out functions — confirm.
type Key = Exclude<any, Function>;
type Value = Exclude<any, Function>;
/** Pairs a selector string with the content handlers registered for it. */
export type ScrapeHandler = {
selector: string;
handler: HTMLRewriterTypes.HTMLRewriterElementContentHandlers;
};
/** Factory that receives a storage map and returns a single `ScrapeHandler`. */
export type ScrapeHandleElement = (setter: Map<Key, Value>) => ScrapeHandler;
/**
 * Factory that receives a map of maps and returns either one `ScrapeHandler`
 * or an array of them.
 */
export type ScrapeHandleMultipleElements = (
setter: Map<Key, Map<Key, Value>>
) => Array<ScrapeHandler> | ScrapeHandler;
/**
 * Map-backed key/value store handed to scrape handlers.
 *
 * Besides plain values, `set` accepts an updater function that receives the
 * value currently stored under the key and returns the value to store.
 */
export class KVScrapeStorage extends Map<Key, Value> {
  /** Creates an empty store; no initial entries are accepted. */
  constructor() {
    super();
  }

  /**
   * Stores `value` under `key`.
   *
   * @param key - Entry key.
   * @param value - Either the value to store, or an updater called with the
   *   value currently stored under `key` (`undefined` when the key is absent
   *   or holds `null`) whose return value is stored instead.
   * @returns This store, so calls can be chained.
   */
  set(key: Key, value: Value | ((prev?: Value) => Value)): this {
    const previous = this.get(key) ?? undefined;
    // `typeof` also recognises functions created in another realm, which
    // `instanceof Function` does not.
    const next = typeof value === 'function' ? value(previous) : value;
    return super.set(key, next);
  }

  /**
   * Returns the value stored under `key`, or `undefined` when absent.
   *
   * Accepts the same key type as `set` and is typed as returning a stored
   * value rather than a nested map.
   */
  get(key: Key): Value | undefined {
    return super.get(key);
  }
}
// Internal to HTMLRewriterScrapeDocument: holds one storage map per alias.
class DocumentKVStorage extends Map<Key, Value> {
  constructor() {
    super();
  }

  /** Looks up the map stored under `key`; `undefined` when there is none. */
  get(key: string): Map<Key, Value> | undefined {
    const stored = super.get(key);
    return stored;
  }

  /**
   * Stores `value` under `key`. When `value` is a function it is invoked with
   * the entry currently stored (or `undefined`) and its result is stored.
   * Returns this instance for chaining.
   */
  set(key: Key, value: Value | ((prev?: Value) => Value)): this {
    const previous = this.get(key) ?? undefined;
    if (value instanceof Function) {
      return super.set(key, value(previous));
    }
    return super.set(key, value);
  }
}
/**
 * Type of the optional `mapToType` argument of `addHandlers`: receives the map
 * collected for an alias and returns a value of any type.
 */
type MaptToTypeHandler = (result: Map<Key, Value>) => any;
/**
 * HTMLRewriter subclass that collects scraped values into per-alias storages.
 *
 * Each call to `addHandlers` creates a fresh `KVScrapeStorage` under an alias
 * and registers the selector handlers that write into it. `scrape` returns an
 * object keyed by alias.
 */
export class HTMLRewriterScrapeDocument extends HTMLRewriter {
  /** One storage per alias, in registration order. */
  private kv = new DocumentKVStorage();

  /**
   * Optional converter per alias.
   *
   * Prototype-less on purpose: with a plain `{}` an alias such as "toString"
   * or "constructor" would resolve to an inherited `Object.prototype` member
   * and be invoked as if it were a converter.
   */
  private mapToType: {
    [key: string]: typeof genericMapToType;
  } = Object.create(null);

  constructor() {
    super();
  }

  /**
   * Registers handlers whose collected values are reported under `alias`.
   *
   * @param alias - Key under which the collected data appears in the result of
   *   `scrape`. Registering the same alias again replaces its storage.
   * @param handlers - A handler factory, or an array of factories. Each
   *   factory is called with the alias storage and returns one handler
   *   definition (a single factory may also return an array of them).
   * @param mapToType - Optional converter applied to the alias storage when
   *   `scrape` builds its result.
   * @returns This instance, so calls can be chained.
   */
  public addHandlers(
    alias: string,
    handlers:
      | Array<ScrapeHandleElement>
      | ScrapeHandleElement
      | ScrapeHandleMultipleElements,
    mapToType?: MaptToTypeHandler
  ) {
    this.kv.set(alias, new KVScrapeStorage());
    const setter = this.kv.get(alias)!;

    if (mapToType) {
      this.mapToType[alias] = mapToType;
    }

    if (Array.isArray(handlers)) {
      handlers.forEach((handle: ScrapeHandleElement) => {
        const { selector, handler: handleParseElement } = handle(setter);
        super.on(selector, handleParseElement);
      });
    } else if (handlers instanceof Function) {
      // A single factory may describe one selector or several.
      const handleParsing = handlers(setter);
      if (Array.isArray(handleParsing)) {
        handleParsing.forEach(({ selector, handler: handleParseElement }) =>
          super.on(selector, handleParseElement)
        );
      } else {
        super.on(handleParsing.selector, handleParsing.handler);
      }
    }

    return this;
  }

  /**
   * Runs the registered handlers over `response` and returns the collected
   * data keyed by alias. An alias with a converter yields the converter's
   * return value; otherwise the storage map itself is returned.
   *
   * NOTE(review): the transformed response is discarded and the storages are
   * read immediately. If the runtime invokes handlers only while the
   * transformed body is being consumed, the storages may still be empty at
   * this point — confirm against the target runtime.
   *
   * @param response - Response whose HTML body is passed to `transform`.
   * @returns Object mapping each alias to its collected (or converted) data.
   */
  scrape(response: Response) {
    super.transform(response);

    const result = {} as { [key: string]: any };
    for (const key of this.kv.keys()) {
      const value = this.kv.get(key);
      if (value) {
        result[key] = this.mapToType[key] ? this.mapToType[key](value) : value;
      }
    }
    return result;
  }
}
// HOW TO PARSE A PAGE
// Selectors
// Selected <option> directly under the element with id "territorial".
const currentSelectedFederationIdSelect = '#territorial > option[selected]';
// First link inside the second child <div> of the element with id "cabecera".
const currentPageFederationHeaderAElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1)';
// Image nested directly inside that link.
const currentPageFederationImgElementSelector =
  '#cabecera > div:nth-child(2) > a:nth-child(1) > img';

// Getting the info from those selectors in the page
/**
 * Builds the handlers that collect the current federation's data.
 *
 * Keys written to `kv`:
 * - `rfebmId`: numeric `value` attribute of the selected option, or `-1` when
 *   the attribute is missing or not a number.
 * - `name`: trimmed text of the selected option, appended chunk by chunk.
 * - `url`: `href` attribute of the header link.
 * - `shieldUrl`: `src` attribute of the header image.
 *
 * @param kv - Storage that receives the values; its `set` must accept an
 *   updater function, which the `name` handler relies on.
 * @returns Three selector/handler pairs.
 */
export const currentFederationInfo = (kv: KVScrapeStorage) => [
  {
    // Federation id and name
    selector: currentSelectedFederationIdSelect,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'rfebmId';
        // `Number(...)` never yields null/undefined, so `?? -1` could not
        // trigger: a missing attribute became 0 and junk became NaN.
        const rawId = element.getAttribute('value');
        const parsedId = rawId === null ? NaN : Number(rawId);
        const value = Number.isNaN(parsedId) ? -1 : parsedId;
        kv.set(key, value);
      },
      text: ({ text }: HTMLRewriterTypes.Text) => {
        const key = 'name';
        const value = text.trim();
        // Append so that text delivered in several chunks is kept.
        kv.set(key, (prev?: string) => (prev ?? '') + value);
      },
    },
  },
  {
    // Url to the federation website
    selector: currentPageFederationHeaderAElementSelector,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'url';
        const urlString = element.getAttribute('href');
        kv.set(key, urlString);
      },
    },
  },
  {
    // Url to the federation shield
    selector: currentPageFederationImgElementSelector,
    handler: {
      element: (element: HTMLRewriterTypes.Element) => {
        const key = 'shieldUrl';
        const imgUrl = element.getAttribute('src');
        kv.set(key, imgUrl);
      },
    },
  },
];
// Parse <select> of all federations
// Every <option> directly under the element with id "territorial".
const federationsSelector = '#territorial > option';

/**
 * Builds the handler that collects every federation option as `id -> name`.
 *
 * For each matched option the numeric `value` attribute becomes the key
 * (`-1` when the attribute is missing or not a number) and the option's
 * trimmed text becomes the value. Options whose text is empty after trimming
 * are not stored.
 *
 * @param kv - Storage that receives one entry per option.
 * @returns A single-element array with the selector/handler pair.
 */
function allFederationsInfo(kv: KVScrapeStorage) {
  let currentNumber = -1;
  // Text of the option currently being read; a text node may arrive in
  // several chunks, so it is accumulated rather than overwritten.
  let currentText = '';
  return [
    {
      selector: federationsSelector,
      handler: {
        element: (element: HTMLRewriterTypes.Element) => {
          // `Number(...)` never yields null/undefined, so `?? -1` could not
          // trigger: a missing attribute became 0 and junk became NaN.
          const rawId = element.getAttribute('value');
          const parsedId = rawId === null ? NaN : Number(rawId);
          currentNumber = Number.isNaN(parsedId) ? -1 : parsedId;
          currentText = '';
        },
        text: ({ text }: HTMLRewriterTypes.Text) => {
          currentText += text;
          const name = currentText.trim();
          if (name.length > 0) {
            kv.set(currentNumber, name);
          }
        },
      },
    },
  ];
}
// How to use all above:
// const rw = new HTMLRewriterScrapeDocument();
// rw.addHandlers('currentFederation', currentFederationInfo);
// rw.addHandlers('allFederations', allFederationsInfo);
// const response = await fetch(
// 'https://www.rfebm.com/competiciones/competicion.php'
// );
// const result = rw.scrape(response);
// console.log(result);
@gtrabanco
Copy link
Author

You can add multiple handlers, and the result will contain all the scraped data under the aliases you provide.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment