Crystina Xinyu Zhang crystina-z

πŸ‘©β€πŸŽ“
View GitHub Profile
crystina-z / git-lfs-install.md
Created April 28, 2024 02:43 — forked from pourmand1376/git-lfs-install.md
Single User Installation of Git-LFS without sudo

I wanted to install Git LFS on my user account without access to the server's root account. I'm writing this for my future self.

1. Download the tar.gz file from the git-lfs website:

   wget https://github.com/git-lfs/git-lfs/releases/download/v3.2.0/git-lfs-linux-amd64-v3.2.0.tar.gz

2. Untar it:

   tar xvf git-lfs-linux-amd64-v3.2.0.tar.gz
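The gist preview stops here. A minimal sketch of the usual remaining steps, assuming a user-writable prefix of `~/.local`, that the tarball extracts to a `git-lfs-3.2.0` directory, and that its `install.sh` honors the `PREFIX` variable (all assumptions, not shown in the gist):

```shell
# Hypothetical continuation (not from the gist): install into ~/.local
cd git-lfs-3.2.0
PREFIX="$HOME/.local" ./install.sh
# Make the binary visible; add this line to ~/.bashrc to persist it
export PATH="$HOME/.local/bin:$PATH"
# Set up the git hooks for the current user
git lfs install
```

If this worked, `git lfs version` should report v3.2.0.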
crystina-z / index_individual.sh
Last active September 14, 2023 22:41
Shared Token Analysis
# This file indexes each language independently,
# using `bert-base-multilingual-cased` tokenizer
cur_file_dir=$(dirname "$(realpath "$0")")
mkdir -p data/individual/
# The collection can be downloaded from:
# Mr. TyDi: https://huggingface.co/datasets/castorini/mr-tydi-corpus
# MIRACL: https://huggingface.co/datasets/miracl/miracl-corpus
def convert_train_queries(topic_tsv, corpus_json_fn, qrel_fn, runfile, output_json_fn):
"""
topic_tsv format: "qid\tquery\n"
corpus_json_fn format: "{docid: ..., text: ...}"
qrel_fn format: qid Q0 docid label
runfile format: qid Q0 docid rank score tag
"""
runs = load_runs(runfile)
qrels = load_qrels(qrel_fn)
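`load_runs` and `load_qrels` are not shown in the preview. A minimal sketch of what they might look like, based only on the TREC-style line formats the docstring describes (the implementations are assumptions, not the gist's actual code):

```python
# Hypothetical implementations of the helpers used above, derived from
# the file formats given in the docstring.
from collections import defaultdict

def load_qrels(qrel_fn):
    """Parse "qid Q0 docid label" lines into {qid: {docid: label}}."""
    qrels = defaultdict(dict)
    with open(qrel_fn) as f:
        for line in f:
            qid, _, docid, label = line.split()
            qrels[qid][docid] = int(label)
    return dict(qrels)

def load_runs(runfile):
    """Parse "qid Q0 docid rank score tag" lines into {qid: {docid: score}}."""
    runs = defaultdict(dict)
    with open(runfile) as f:
        for line in f:
            qid, _, docid, _, score, _ = line.split()
            runs[qid][docid] = float(score)
    return dict(runs)
```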
crystina-z / prepare_pretokenized_collection.py
Last active April 28, 2021 01:25
for pre-tokenized msmarco
import os
import time
import json
import multiprocessing
from argparse import ArgumentParser
from tqdm import tqdm
from transformers import AutoTokenizer
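The preview shows only the imports. A hedged sketch of what a pre-tokenization pass over a JSONL collection might look like; `tokenize_fn` stands in for `AutoTokenizer.from_pretrained(...).tokenize` (any `str -> list[str]` callable works), and all names here are assumptions rather than the gist's actual code:

```python
import json
import multiprocessing

def _tokenize_line(args):
    # Worker: parse one JSONL document and attach a whitespace-joined
    # token string under a new "tokenized" field.
    line, tokenize_fn = args
    doc = json.loads(line)
    doc["tokenized"] = " ".join(tokenize_fn(doc["contents"]))
    return json.dumps(doc)

def pretokenize(in_fn, out_fn, tokenize_fn, workers=1):
    with open(in_fn) as f:
        lines = [line for line in f if line.strip()]
    jobs = [(line, tokenize_fn) for line in lines]
    if workers > 1:
        with multiprocessing.Pool(workers) as pool:
            results = pool.map(_tokenize_line, jobs)
    else:
        results = [_tokenize_line(job) for job in jobs]
    with open(out_fn, "w") as f:
        f.write("\n".join(results) + "\n")
```

With `workers > 1`, `tokenize_fn` must be picklable (a module-level function rather than a lambda).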
crystina-z / README.md
Last active December 5, 2020 18:32
set up capreolus on cc

The setup file assumes the user has Anaconda or Miniconda installed.

tl;dr

dir=$HOME/setup_capr  # don't remove this directory
mkdir -p $dir
cd $dir
# todo: download this gist and unzip
sh setup.sh
crystina-z / run.sh
Created July 12, 2020 06:35
run covid
# to use this script:
# sh run.sh 2
# where 2 can be replaced by 3, 4
python run_covid.py --round $1 --udel all --useprevqrels True --index all
import jnius_config
path_to_anserini_fat_jar = ""  # a previous release version may be safer; the latest one changed a lot of the API
jnius_config.set_classpath(path_to_anserini_fat_jar)
# after config
from jnius import autoclass
stemmer = "porter"
analyzer = autoclass("io.anserini.analysis.EnglishStemmingAnalyzer")(stemmer)
tokenize = autoclass("io.anserini.analysis.AnalyzerUtils").tokenize
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
'''
Interpolate the results from Capreolus neural IR models with the BM25+RM3 run.

Usage:
python inter.py
--train 'f1:f2:f5'
--model_path '$CAPR_PATH/BertKNRM_batch-2_jointbert-True_lesslabel-True_maxdoclen-800_maxqlen-6_nbatches-256_niters-100_normalpretrainepoch-100_onlyonepos-True_predictalways-True_runqrel-True_samplemode-doc_sampleratio-0.01_softmaxloss-True'
--five_fold_path '/home/x978zhan/tmp/interpolate_data/5_folds'

Expected output:
five_fold_path='/home/x978zhan/tmp/interpolate_data/5_folds'
rerank_path='$RESULT_PATH/BertKNRM_batch-2_jointbert-True_lesslabel-True_maxdoclen-800_maxqlen-6_nbatches-256_niters-100_normalpretrainepoch-100_onlyonepos-True_predictalways-True_runqrel-True_samplemode-doc_sampleratio-0.01_softmaxloss-True'
# train_fold='f1:f2:f3'
# dev_fold='f4'
# test_fold='f5'
train_fold='f1:f2:f5'
dev_fold='f3'
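A minimal sketch of the score interpolation the docstring describes, assuming both runs are `{qid: {docid: score}}` dicts and that the weight `alpha` is tuned on the dev fold; the function name and the linear weighting form are assumptions, not Capreolus code:

```python
def interpolate(first_run, neural_run, alpha):
    """Combine two runs: alpha * first-stage score + (1 - alpha) * neural score.

    Docs missing from the neural run keep only their weighted first-stage score.
    """
    combined = {}
    for qid, docs in first_run.items():
        neural_docs = neural_run.get(qid, {})
        combined[qid] = {
            docid: alpha * score + (1 - alpha) * neural_docs.get(docid, 0.0)
            for docid, score in docs.items()
        }
    return combined
```

In practice one would sweep `alpha` over a grid on the dev fold and keep the value with the best metric before scoring the test fold.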