{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "4ce0fba5",
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained('distilgpt2')\n",
"model = AutoModelForCausalLM.from_pretrained('distilgpt2')\n",
"generator = pipeline('text-generation', model=model, tokenizer=tokenizer)"
]
},
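{
"cell_type": "markdown",
"id": "b7e2d4f1",
"metadata": {},
"source": [
"The `generator` pipeline above is loaded but never called later in this notebook. A minimal added sketch of how it could be used (the prompt string, `max_length`, and `do_sample` values are arbitrary illustrations, not part of the original assignment):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8f3e5a2",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical demo of the distilgpt2 pipeline loaded above.\n",
"# max_length and do_sample are standard pipeline keyword arguments.\n",
"result = generator('The library is', max_length=20, do_sample=True)\n",
"print(result[0]['generated_text'])"
]
},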
{
"cell_type": "code",
"execution_count": 3,
"id": "0572d1c2",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import markovify\n",
"import spacy\n",
"import re\n",
"import tracery\n",
"import textwrap"
]
},
{
"cell_type": "markdown",
"id": "997f8fb1",
"metadata": {},
"source": [
"# importing the text from \"The library of the Babel\" and some body related poems"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f948e219",
"metadata": {},
"outputs": [],
"source": [
"text_arch = open(\"thelibraryofthebabel.txt\").read()\n",
"text_body = open(\"Body.txt\").read()"
]
},
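{
"cell_type": "markdown",
"id": "d9a4f6b3",
"metadata": {},
"source": [
"A later output contains a Unicode replacement character, which suggests an encoding mismatch somewhere in the source files. An added defensive variant (assuming the files are meant to be UTF-8; `errors='replace'` is a standard `open()` option that keeps the read from raising on stray bytes):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0b5a7c4",
"metadata": {},
"outputs": [],
"source": [
"# Re-read with an explicit encoding; errors='replace' substitutes\n",
"# undecodable bytes instead of raising an exception.\n",
"text_arch = open('thelibraryofthebabel.txt', encoding='utf-8', errors='replace').read()\n",
"text_body = open('Body.txt', encoding='utf-8', errors='replace').read()"
]
},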
{
"cell_type": "code",
"execution_count": 5,
"id": "ed4ab878",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" The content was also deciphered: some notions of\n",
"combinative analysis, illustrated with examples of variations with unlimited\n",
"repetition\n"
]
}
],
"source": [
"sentences_arch = text_arch.split('.')\n",
"print(random.choice(sentences_arch))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0b313455",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"much with her? and has she been much\n"
]
}
],
"source": [
"sentences_body = text_body.split(\"\\n\")\n",
"print(random.choice(sentences_body))"
]
},
{
"cell_type": "markdown",
"id": "440fbfc2",
"metadata": {},
"source": [
"# using space the categorize the words used in architecture text"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c672ce4a",
"metadata": {},
"outputs": [],
"source": [
"nlp_arch = spacy.load('en_core_web_md')\n",
"doc_arch = nlp_arch(text_arch)\n",
"sentences_arch = list(doc_arch.sents)\n",
"\n",
"words_arch = [w for w in list(doc_arch) if w.is_alpha]\n",
"noun_chunks_arch = list(doc_arch.noun_chunks)\n",
"entities_arch = list(doc_arch.ents)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cea45565",
"metadata": {},
"outputs": [],
"source": [
"nouns_arch = [w for w in words_arch if w.pos_ == \"NOUN\"]\n",
"verbs_arch = [w for w in words_arch if w.pos_ == \"VERB\"]\n",
"adjs_arch = [w for w in words_arch if w.pos_ == \"ADJ\"]\n",
"advs_arch = [w for w in words_arch if w.pos_ == \"ADV\"]"
]
},
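{
"cell_type": "markdown",
"id": "f1c6b8d5",
"metadata": {},
"source": [
"A quick way to inspect these categories is to count the most frequent lemmas in each. This is an added sketch using the standard library's `collections.Counter`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2d7c9e6",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"# Tally the most common noun lemmas in the architecture text.\n",
"noun_counts = Counter(w.lemma_ for w in nouns_arch)\n",
"print(noun_counts.most_common(10))"
]
},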
{
"cell_type": "code",
"execution_count": 13,
"id": "51e39f53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"exception"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(nouns_arch)"
]
},
{
"cell_type": "markdown",
"id": "df24f56f",
"metadata": {},
"source": [
"# using markov model to generate the mix of two texts style"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "99398d5c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"\"body or another person's body,\""
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(sentences_body)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "0b0be2c6",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"generator_arch = markovify.Text(text_arch)\n",
"generator_body = markovify.Text(text_body)\n",
"length_arch = 500\n",
"length_body = 80\n",
"length_combo = 50\n",
"weight_arch = 0.3\n",
"weight_body = 0.7\n",
"combo = markovify.combine([generator_arch, generator_body], [weight_arch, weight_body])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c2a41255",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"For a century they have exhausted the hexagons ...\n"
]
}
],
"source": [
"print(combo.make_short_sentence(length_combo))"
]
},
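{
"cell_type": "markdown",
"id": "b3e8d0f7",
"metadata": {},
"source": [
"Note that markovify's `make_short_sentence` returns `None` when it cannot build a sentence under the length cap, which would crash the spaCy calls and string joins later in this notebook. A small added retry wrapper (the helper name and retry count are arbitrary choices, not part of the original code):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4f9e1a8",
"metadata": {},
"outputs": [],
"source": [
"def short_sentence_or_retry(gen, max_chars, attempts=10):\n",
"    # Ask the generator repeatedly until it returns a sentence.\n",
"    for _ in range(attempts):\n",
"        s = gen.make_short_sentence(max_chars)\n",
"        if s is not None:\n",
"            return s\n",
"    return ''  # give up gracefully instead of propagating None\n",
"\n",
"print(short_sentence_or_retry(combo, length_combo))"
]
},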
{
"cell_type": "markdown",
"id": "dfefe740",
"metadata": {},
"source": [
"# get some markov sentences from the body poems"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "fccf270c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The man's body at auction!\n"
]
}
],
"source": [
"print(generator_body.make_short_sentence(length_body))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d8d66ebe",
"metadata": {},
"outputs": [],
"source": [
"mk_body = []\n",
"for line in range(4):\n",
" mk_body.append(generator_body.make_short_sentence(length_body))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f9ad6f74",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Have you ever loved the body of man is calming and ex- cellent to the mothers.', 'Whatever the bids of the parts of you!', \"A man's body is sacred�it is no matter who, Is it one of the parts of you!\", 'Is it one of the eye, eye-brows, and the outlet again.']\n"
]
}
],
"source": [
"print(mk_body)"
]
},
{
"cell_type": "markdown",
"id": "5ea783d2",
"metadata": {},
"source": [
"# get some markov sentences from the architecture text and store the nouns"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "09df5dd4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"To the left and right of the same series may have examined and read it.\n"
]
}
],
"source": [
"mk_arch = generator_arch.make_short_sentence(length_arch)\n",
"print(mk_arch)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c15c9f07",
"metadata": {},
"outputs": [],
"source": [
"nlp_mk_arch = spacy.load('en_core_web_md')\n",
"doc_mk_arch = nlp_mk_arch(mk_arch)\n",
"sentences_mk_arch = list(doc_arch.sents)\n",
"\n",
"words_mk_arch = [w for w in list(doc_mk_arch) if w.is_alpha]\n",
"noun_chunks_mk_arch = list(doc_mk_arch.noun_chunks)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0bc89440",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"the same series"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(noun_chunks_mk_arch)"
]
},
{
"cell_type": "markdown",
"id": "8bed82fa",
"metadata": {},
"source": [
"# replace the nouns in markov body sentences with nouns from architecture"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "6ab275d6",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Have, you, ever, loved, the, body, of, man, is, calming, and, cellent, to, the, mothers]\n"
]
}
],
"source": [
"nlp_line = spacy.load('en_core_web_md')\n",
"doc_line = nlp_line(mk_body[0])\n",
"\n",
"words_line = [w for w in list(doc_line) if w.is_alpha]\n",
"noun_chunks_line = list(doc_line.noun_chunks)\n",
"\n",
"print(words_line)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "37aa91f3",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[you, the body, man, the mothers]\n"
]
}
],
"source": [
"print(noun_chunks_line)\n"
]
},
{
"cell_type": "code",
"execution_count": 164,
"id": "1a3892fc",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['Have',\n",
" 'you',\n",
" 'ever',\n",
" 'loved',\n",
" 'the',\n",
" 'it',\n",
" 'of',\n",
" 'it',\n",
" 'is',\n",
" 'calming',\n",
" 'and',\n",
" 'cellent',\n",
" 'to',\n",
" 'the',\n",
" 'the left']"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for i in range(len(words_line)):\n",
" if words_line[i].pos_ == \"NOUN\":\n",
" words_line[i] = str(random.choice(noun_chunks_mk_arch))\n",
" words_line[i] = str(words_line[i])\n",
"words_line"
]
},
{
"cell_type": "code",
"execution_count": 165,
"id": "6e5db93f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Have you ever loved the it of it is calming and cellent to the the\n",
"left\n"
]
}
],
"source": [
"print(textwrap.fill(\" \".join(words_line)))"
]
},
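{
"cell_type": "markdown",
"id": "d5a0f2b9",
"metadata": {},
"source": [
"The replacement logic above could be wrapped in a small helper so the stanza cells below would not have to repeat it. This is an added sketch, not part of the original notebook; it reuses the `nlp_line` model, `random`, and `textwrap` already loaded above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6b1a3c0",
"metadata": {},
"outputs": [],
"source": [
"def replace_nouns(line, chunks, nlp=nlp_line):\n",
"    # Swap every NOUN token in `line` for a random noun chunk from `chunks`.\n",
"    words = [w for w in nlp(line) if w.is_alpha]\n",
"    out = [str(random.choice(chunks)) if w.pos_ == 'NOUN' else w.text for w in words]\n",
"    return ' '.join(out)\n",
"\n",
"print(textwrap.fill(replace_nouns(mk_body[0], noun_chunks_mk_arch)))"
]
},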
{
"cell_type": "markdown",
"id": "68f73d92",
"metadata": {},
"source": [
"# Generate a stanza repeating the process above"
]
},
{
"cell_type": "code",
"execution_count": 173,
"id": "1a91b26a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"How do you know who shall be stript that you and he might touch each\n",
"other\n",
"And if the the next-to-last page only but of the the senseless\n",
"perdition of These phrases\n",
"How do you know so much that you may see them\n",
"And if the one time of a well made hope appears not only one the wind\n",
"this is the The original manuscript\n"
]
}
],
"source": [
"mk_body = []\n",
"for line in range(4):\n",
" mk_body.append(generator_body.make_short_sentence(length_body))\n",
" \n",
"mk_arch = generator_arch.make_short_sentence(length_arch)\n",
"nlp_mk_arch = spacy.load('en_core_web_md')\n",
"doc_mk_arch = nlp_mk_arch(mk_arch)\n",
"sentences_mk_arch = list(doc_arch.sents)\n",
"\n",
"words_mk_arch = [w for w in list(doc_mk_arch) if w.is_alpha]\n",
"noun_chunks_mk_arch = list(doc_mk_arch.noun_chunks)\n",
"\n",
"for line in range(4):\n",
" nlp_line = spacy.load('en_core_web_md')\n",
" doc_line = nlp_line(mk_body[line])\n",
"\n",
" words_line = [w for w in list(doc_line) if w.is_alpha]\n",
" noun_chunks_line = list(doc_line.noun_chunks)\n",
"\n",
" for i in range(len(words_line)):\n",
" if words_line[i].pos_ == \"NOUN\":\n",
" words_line[i] = str(random.choice(noun_chunks_arch))\n",
" words_line[i] = str(words_line[i])\n",
" \n",
" print(textwrap.fill(\" \".join(words_line)))"
]
},
{
"cell_type": "code",
"execution_count": 187,
"id": "0764841c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The a few miles is a a delirious divinity\n",
"Within there runs the one the same old beautiful total\n",
"The prophecy these canonical books at they\n",
"In them and of them that pleases the inalterable MCV well\n"
]
}
],
"source": [
"mk_body = []\n",
"for line in range(4):\n",
" mk_body.append(generator_body.make_short_sentence(length_body))\n",
" \n",
"mk_arch = combo.make_short_sentence(100)\n",
"nlp_mk_arch = spacy.load('en_core_web_md')\n",
"doc_mk_arch = nlp_mk_arch(mk_arch)\n",
"sentences_mk_arch = list(doc_arch.sents)\n",
"\n",
"words_mk_arch = [w for w in list(doc_mk_arch) if w.is_alpha]\n",
"noun_chunks_mk_arch = list(doc_mk_arch.noun_chunks)\n",
"\n",
"for line in range(4):\n",
" nlp_line = spacy.load('en_core_web_md')\n",
" doc_line = nlp_line(mk_body[line])\n",
"\n",
" words_line = [w for w in list(doc_line) if w.is_alpha]\n",
" noun_chunks_line = list(doc_line.noun_chunks)\n",
"\n",
" for i in range(len(words_line)):\n",
" if words_line[i].pos_ == \"NOUN\":\n",
" words_line[i] = str(random.choice(noun_chunks_arch))\n",
" words_line[i] = str(words_line[i])\n",
" \n",
" print(textwrap.fill(\" \".join(words_line)))"
]
}
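,
{
"cell_type": "markdown",
"id": "f7c2b4d1",
"metadata": {},
"source": [
"A possible wrap-up, assuming the hypothetical `short_sentence_or_retry` and `replace_nouns` helpers sketched earlier: generate several stanzas in a row, drawing each stanza's noun chunks from a fresh architecture sentence."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8d3c5e2",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a three-stanza poem, four lines per stanza.\n",
"for stanza in range(3):\n",
"    arch_line = short_sentence_or_retry(generator_arch, length_arch)\n",
"    chunks = list(nlp_mk_arch(arch_line).noun_chunks)\n",
"    if not chunks:\n",
"        chunks = noun_chunks_arch  # fall back to the full text's chunks\n",
"    for _ in range(4):\n",
"        body_line = short_sentence_or_retry(generator_body, length_body)\n",
"        print(textwrap.fill(replace_nouns(body_line, chunks)))\n",
"    print()"
]
}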
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}