Mohammad Mahdi Samiei mmsamiei

## python-rtl-text-handling.py
from bidi import algorithm as bidi_algorithm
import arabic_reshaper
def rtl(x):
    """Return *x* prepared for display in a left-to-right-only renderer.

    The text is first passed through ``arabic_reshaper.reshape`` and the
    result is handed to ``bidi_algorithm.get_display``.

    NOTE(review): presumably used so Arabic/Persian labels render correctly
    in tools without native RTL support (e.g. plotting) -- confirm with callers.
    """
    return bidi_algorithm.get_display(arabic_reshaper.reshape(x))


# Demo call; in a notebook the returned string is echoed as the cell output.
rtl('سلام سلام سلام')

## position_embedding.py
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an input, then apply dropout.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is built once: even
    channels hold ``sin(position * div_term)`` and odd channels hold
    ``cos(position * div_term)``. The table is registered as a buffer, so it
    follows the module across ``.to(...)`` calls but is not a trainable
    parameter.

    Args:
        d_model: Size of the last (feature) dimension of the encoding table.
        dropout: Dropout probability applied to the output of ``forward``.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd channel than even channels, so trim
        # div_term to the number of odd channels before filling them.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Keep the table on the module; previously it was computed and dropped.
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Tensor whose first dimension indexes positions and whose last
                dimension is ``d_model``; the singleton middle axis of ``pe``
                broadcasts over the second dimension.
                NOTE(review): i.e. a ``(seq_len, batch, d_model)`` layout is
                assumed -- confirm against callers.
        """
        return self.dropout(x + self.pe[: x.size(0)])

## eval
# Notebook cell: lines starting with "!" are IPython/Jupyter shell escapes, so
# this snippet runs inside a notebook, not as a plain Python script.
# Quietly (-q) install the `nlp` and `bert_score` packages.
!pip -q install nlp
!pip -q install bert_score

from nlp import load_metric

# Load the metric registered under the name "bertscore".
# NOTE(review): presumably the reason bert_score is installed above -- confirm.
metric = load_metric("bertscore")


import numpy as np

## bertsco
# Two independent, initially empty accumulators.
# NOTE(review): presumably filled with reference texts and model outputs
# before scoring -- the code that fills them is not visible here.
refrences = list()
predictions = list()

# Decoding options, collected in one mapping.
# NOTE(review): the keys look like text-generation arguments (beam count,
# length limit, special token ids) -- confirm against the call that uses them.
kwargs = dict(
    num_beams=1,
    num_return_sequences=1,
    temperature=1,
    max_length=50,
    early_stopping=True,
    no_repeat_ngram_size=3,
    decoder_start_token_id=0,
    eos_token_id=2,
    # 'do_sample': True is deliberately left disabled.
)

## bertscore
# Notebook cell: lines starting with "!" are IPython/Jupyter shell escapes, so
# this snippet runs inside a notebook, not as a plain Python script.
# Quietly (-q) install the `nlp` and `bert_score` packages.
!pip -q install nlp
!pip -q install bert_score

from nlp import load_metric

# Load the metric registered under the name "bertscore".
# NOTE(review): presumably the reason bert_score is installed above -- confirm.
metric = load_metric("bertscore")


import numpy as np

## Download Project Dataset
# Notebook shell escapes: quietly (-q) download the Google Drive file with the
# given id to dataset.zip, then extract the archive into the current directory.
! wget -q "https://drive.google.com/uc?export=download&id=1-3tnHTdDjtMd9O2LgKN2ir3t5KvnqrXI" -O dataset.zip
! unzip dataset.zip

import subprocess
import shlex

# Archive name.
# NOTE(review): presumably the local target of the download -- it is not used
# in the visible code, so confirm.
file_name = "mscoco.zip"

# Google Drive file identifier that is appended to the download URL below.
file_id = "1xhiGDTihHYUbGES88sYt4S6nLDjKEji1"

# Download endpoint for `file_id`.
# NOTE(review): the name suggests this first request is only used to obtain a
# confirmation cookie -- verify against the code that consumes it.
url_get_cookie = "https://drive.google.com/uc?export=download&id=" + file_id

## create_feature
# Sentiment polarity of each entry in the 'Text' column, as reported by TextBlob.
df['polarity'] = df['Text'].map(
    lambda review: textblob.TextBlob(review).sentiment.polarity
)
# Number of characters in each entry after conversion to str.
df['review_len'] = df['Text'].astype(str).map(len)
# Number of whitespace-separated tokens in each entry after conversion to str.
df['word_count'] = df['Text'].map(lambda review: len(str(review).split()))

## phase_2_dataset_idea2.py
import json
from pprint import pprint
from tqdm import tqdm

# Read the whole JSON document; the context manager closes the file handle
# even if parsing fails (the bare open() used before never closed it).
# JSON text is UTF-8 by specification, so do not rely on the platform default.
with open('test_random_split.json', encoding='utf-8') as freader:
    data = json.load(freader)

# Report how many top-level entries were loaded.
print(len(data))

# Accumulator for the derived dataset; starts empty.
new_dataset = []

## json to jsonl.py
import json

# Read the source JSON document; the context manager closes the handle.
with open('test_random_split.json', encoding='utf-8') as freader:
    data = json.load(freader)

# Write one JSON value per line (JSON Lines layout). The original `with`
# statement had no target name or colon, so the snippet could not be parsed.
with open('correct-sample.json', 'w', encoding='utf-8') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')

## wikipedia_10000_vital_articles.json
[
{"page": "Mathematics"},
{"page": "Mathematician"},
{"page": "Arithmetic"},
{"page": "Addition"},
{"page": "Subtraction"},
{"page": "Multiplication"},
{"page": "Division (mathematics)"},
{"page": "Euclidean algorithm"},
{"page": "Fraction (mathematics)"},
	from bidi import algorithm as bidi_algorithm
	import arabic_reshaper
	rtl = lambda x : bidi_algorithm.get_display(arabic_reshaper.reshape(x))
	rtl('سلام سلام سلام')
	class PositionalEncoding(nn.Module):

	def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
	super().__init__()
	self.dropout = nn.Dropout(p=dropout)

	position = torch.arange(max_len).unsqueeze(1)
	div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
	pe = torch.zeros(max_len, 1, d_model)
	pe[:, 0, 0::2] = torch.sin(position * div_term)
	!pip -q install nlp
	!pip -q install bert_score

	from nlp import load_metric

	metric = load_metric("bertscore")



	import numpy as np
	refrences = []
	predictions = []

	kwargs = {'num_beams':1,
	'num_return_sequences':1,'temperature':1, 'max_length':50,'early_stopping':True,
	'no_repeat_ngram_size':3,
	'decoder_start_token_id':0,
	'eos_token_id':2
	#'do_sample':True
	}
	! wget -q "https://drive.google.com/uc?export=download&id=1-3tnHTdDjtMd9O2LgKN2ir3t5KvnqrXI" -O dataset.zip
	! unzip dataset.zip

	import subprocess
	import shlex

	file_id = "1xhiGDTihHYUbGES88sYt4S6nLDjKEji1"
	file_name = "mscoco.zip"
	url_get_cookie = f"https://drive.google.com/uc?export=download&id={file_id}"
	df['polarity'] = df['Text'].map(lambda text: textblob.TextBlob(text).sentiment.polarity)
	df['review_len'] = df['Text'].astype(str).apply(len)
	df['word_count'] = df['Text'].apply(lambda x: len(str(x).split()))
	import json
	from pprint import pprint
	from tqdm import tqdm

	freader = open('test_random_split.json')
	data = json.load(freader)

	print(len(data))

	new_dataset = []
	[
	{"page": "Mathematics"},
	{"page": "Mathematician"},
	{"page": "Arithmetic"},
	{"page": "Addition"},
	{"page": "Subtraction"},
	{"page": "Multiplication"},
	{"page": "Division (mathematics)"},
	{"page": "Euclidean algorithm"},
	{"page": "Fraction (mathematics)"},