Skip to content

Instantly share code, notes, and snippets.

@almugabo
almugabo / example_config.yaml
Last active October 6, 2024 14:25
TorchTune recipe config
## to make it easier to work with the paths in torchtune
path_model_downloaded: /home/mike/_torch_tune/_xmodel_downloaded/MetaLlama-3.1-8B
path_model_finetuned: /home/mike/_torch_tune/_xmodel_finetuned/MetaLlama-3.1-8B_finetuned
# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: ${path_model_downloaded}/original/tokenizer.model
@almugabo
almugabo / wrapper_torchtune.py
Created October 6, 2024 13:48
Torchtune wrappers
## This gist includes some scripts to work with Torchtune
from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from pathlib import Path
def download_model(model_id: str, output_dir: str = None, hf_token: str = None, ignore_patterns: list = ["*.safetensors"]) -> None:
"""
@almugabo
almugabo / compare_tokenizer_gemma_llama.py
Created February 23, 2024 19:06
compare tokenizers of gemma and llama
import os
import pandas as pd
import random
from transformers import AutoTokenizer
# Placeholder path: folder with a bunch of text files.
xFld = "...."

# Tokenizers being compared: TinyLlama first, then Gemma 2B.
xtok_tl = AutoTokenizer.from_pretrained("unsloth/tinyllama-bnb-4bit")
xtok_ge = AutoTokenizer.from_pretrained("unsloth/gemma-2b-bnb-4bit")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 05:27:37 2024
@author: mike
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
@almugabo
almugabo / reference_extraction.py
Created June 3, 2021 04:29
extraction of references in structured form
"""
FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a
Process RIS format following the standard at
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """
import re
ALLOWED_TAGS = {"TY" : "Record start",
"ER" : "Record end",
@almugabo
almugabo / get_photo_unsplash.py
Last active May 31, 2021 08:43
get picture from unsplash
# get photo from unsplash
# depends on python-unsplash
# !pip install python-unsplash
from unsplash.api import Api
from unsplash.auth import Auth
import requests
from PIL import Image
from io import BytesIO
@almugabo
almugabo / resnews_template_word.py
Last active May 29, 2021 08:30
resnews_template_word.py
# create style
from docx import Document
# create style
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import Inches, Pt
## TO DO
## [1] add an image in a cell
@almugabo
almugabo / resnews_templating.py
Created April 30, 2021 10:17
research news template
from string import Template
xDocHead = '''<!DOCTYPE html>
<html><head>
<meta http-equiv="content-type" content="text/html; charset=windows-1252">
<style>
.title_orange {
font-family: Verdana, Geneva, sans-serif;
font-size: 14px;
@almugabo
almugabo / open_patent_services.py
Created May 3, 2019 04:13
open patent services
import requests
from base64 import b64encode
import requests
import json
# Endpoints on ops.epo.org (API version 3.2).
# UrlAuth: access-token endpoint.
UrlAuth = "https://ops.epo.org/3.2/auth/accesstoken"
# UrlServiceBase: root of the published-data REST services.
UrlServiceBase = "https://ops.epo.org/3.2/rest-services/published-data"
# UrlServiceBaseSearch: bibliographic search prefix; the query text is
# appended after "q=".
UrlServiceBaseSearch = "{}/search/biblio/?q=".format(UrlServiceBase)
@almugabo
almugabo / HungarianMethod.py
Created March 24, 2019 19:22
Hungarian Method, wrapper around scipy.optimize.linear_sum_assignment
import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
def make_assignments(xDF):
'''
a simple wrapper around the
scipy.optimize.linear_sum_assignment
which implements the Hungarian Algorithm