cjue25/01_read.py

## 01_read.py
def read_seq(inputfile):
    """Read a sequence file and return its contents as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file's text with every newline and carriage-return character
        removed.
    """
    with open(inputfile, 'r') as handle:
        raw = handle.read()
    # Drop line terminators so the sequence can be sliced by position.
    return raw.replace("\n", "").replace("\r", "")

## 02_translate.py
# Codon -> one-letter amino-acid lookup; '_' marks the stop codons.
table = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}


def translate(seq):
    """Translate a DNA string into a string of one-letter amino-acid codes.

    Args:
        seq: DNA sequence made of upper-case A/C/G/T characters.

    Returns:
        One character per codon, looked up in ``table``. If the length of
        ``seq`` is not a multiple of three, an empty string is returned.

    Raises:
        KeyError: If a three-letter chunk is not a key of ``table``.
    """
    # A sequence that cannot be cut into whole codons yields nothing.
    if len(seq) % 3 != 0:
        return ""
    codons = (seq[start:start + 3] for start in range(0, len(seq), 3))
    return "".join(table[codon] for codon in codons)


# Load the DNA sequence and the protein sequence it is compared against.
dna=read_seq("dna.txt")
prt=read_seq("protein.txt")

# Check accuracy: does the translated slice match the protein sequence?
print (prt == translate(dna[20:938])[:-1])
# When NCBI provides a CDS, the DNA segment it gives ends with the stop codon,
# so the last translated character has to be dropped.

## 03_hw1-1.py
import string
# Cipher alphabet: a space followed by the 26 lower-case letters (27 symbols).
alphabet = " " + string.ascii_lowercase

# Lookup from each alphabet character to its index in ``alphabet``.
positions = {}
for i in range(len(alphabet)):
    positions[alphabet[i]] = i

message = "hi my name is caesar"
encoded_message = ""

# Caesar-shift every character one position forward.
for i in message:
    # The sum must be parenthesised: ``%`` binds tighter than ``+``, so the
    # unparenthesised form never wraps and 'z' would index past the alphabet.
    encoded_message += alphabet[(positions[i] + 1) % len(alphabet)]

## 03_hw1-2.py
def encoding(message,key):
    """Apply a Caesar shift of ``key`` positions to every character of ``message``.

    Uses the module-level ``alphabet`` string and the ``positions`` lookup
    (character -> index). Positions wrap around modulo 27, so a negative
    ``key`` reverses the matching positive shift.

    Args:
        message: Text to shift; every character must be a key of
            ``positions``, otherwise a KeyError is raised.
        key: Integer number of positions to shift by.

    Returns:
        The shifted text as a new string.
    """
    shifted = (alphabet[(positions[symbol] + key) % 27] for symbol in message)
    return "".join(shifted)

# Encode the message with a shift of three positions.
encoded_message=encoding(message,3)

print (encoded_message)

# Decode by applying the opposite shift to the encoded text.
decoded_message = encoding(encoded_message,-3)

# print your decoded message here!
print (decoded_message)

## 04_count_words.py
from collections import Counter
# Sample sentence passed to the word-counting functions below.
text="This is my test text. We're friends."

def count_words(text):
    """Count how often each word occurs in ``text``, ignoring case and punctuation.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each lower-cased word (split on single spaces) to the
        number of times it occurs.
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for mark in skips:
        # str.replace returns a new string; without the reassignment the
        # punctuation would never actually be removed.
        text = text.replace(mark, "")

    # Named differently from the function so the function is not shadowed.
    word_counts = {}
    for word in text.split(" "):
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

def count_words_fast(text):
    """Count word occurrences with ``collections.Counter``.

    Args:
        text: The string to analyse.

    Returns:
        A Counter mapping each lower-cased word (split on single spaces) to
        the number of times it occurs, with punctuation removed.
    """
    text = text.lower()
    skips = [".", ",", ";", ":", "'", '"']
    for mark in skips:
        # str.replace returns a new string; without the reassignment the
        # punctuation would never actually be removed.
        text = text.replace(mark, "")

    return Counter(text.split(" "))

# Print whether both word-counting implementations return equal results.
print (count_words(text)==count_words_fast(text))

## 05_read_and_stats.py
def read_book(title_path):
    """Load a UTF-8 text file and return its contents as a single line.

    Args:
        title_path: Path of the book file to read.

    Returns:
        The file's text with all newline and carriage-return characters
        removed (nothing is inserted in their place).
    """
    with open(title_path, "r", encoding="utf8") as book_file:
        contents = book_file.read()
    for line_break in ("\n", "\r"):
        contents = contents.replace(line_break, "")
    return contents

def word_stats(word_counts):
    """Summarise a word-frequency mapping.

    Args:
        word_counts: Mapping of word -> occurrence count.

    Returns:
        A tuple ``(num_unique, counts)``: the number of entries in the
        mapping and the mapping's values view.
    """
    return len(word_counts), word_counts.values()


# NOTE(review): "filepath" looks like a placeholder path; confirm the intended file.
text=read_book("filepath")
word_counts=count_words(text)
(num_unique, counts)=word_stats(word_counts)
print (num_unique, sum(counts)) ## number of distinct words, and total number of words

## 06_books.py
import os
book_dir="./Books"

import pandas as pd
# One row per book, keyed by a running number that starts at 1.
stats=pd.DataFrame(columns=('language','author','title','length','unique'))
title_num=1

# Directory layout walked here: <book_dir>/<language>/<author>/<title>
for language in os.listdir(book_dir):
    language_dir = book_dir + '/' + language
    for author in os.listdir(language_dir):
        author_dir = language_dir + '/' + author
        for title in os.listdir(author_dir):
            inputfile = author_dir + '/' + title
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            # 'length' holds the summed word counts, 'unique' the number of distinct words.
            stats.loc[title_num] = (
                language,
                author.capitalize(),
                title.replace(".txt", ""),
                sum(counts),
                num_unique,
            )
            title_num += 1

## 07_plot.py
import matplotlib.pyplot as plt

plt.figure(figsize=(10,10))
# Draw one log-log series per language, each in its own colour.
for lang_name, lang_color in (
    ("English", "crimson"),
    ("French", "forestgreen"),
    ("German", "orange"),
    ("Portuguese", "blueviolet"),
):
    subset = stats[stats.language == lang_name]
    plt.loglog(subset.length, subset.unique, 'o', label=lang_name, color=lang_color)
plt.legend()
plt.xlabel("Book length")
plt.ylabel("Number of unique words")
plt.savefig("lang_plt.png")
	def read_seq(inputfile):
	    """Read a sequence file and return its contents as one unbroken string.

	    Args:
	        inputfile: Path of the text file to read.

	    Returns:
	        The file's text with every newline and carriage-return character
	        removed.
	    """
	    # Nested indentation restored: the body had been flattened to one level.
	    with open(inputfile, 'r') as f:
	        seq = f.read()
	    seq = seq.replace("\n", "")
	    seq = seq.replace("\r", "")
	    return seq
	# Codon -> one-letter amino-acid lookup; '_' marks the stop codons.
	table = {
	    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
	    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
	    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
	    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
	    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
	    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
	    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
	    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
	    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
	    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
	    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
	    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
	    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
	    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
	    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
	    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
	}


	def translate(seq):
	    """Translate a DNA string into a string of one-letter amino-acid codes.

	    Args:
	        seq: DNA sequence made of upper-case A/C/G/T characters.

	    Returns:
	        One character per codon, looked up in ``table``. If the length of
	        ``seq`` is not a multiple of three, an empty string is returned.

	    Raises:
	        KeyError: If a three-letter chunk is not a key of ``table``.
	    """
	    # Nested indentation restored: the if/for nesting had been flattened.
	    protein = ""
	    if len(seq) % 3 == 0:
	        for i in range(0, len(seq), 3):
	            codon = seq[i:i+3]
	            protein += table[codon]

	    return protein


	# Load the DNA sequence and the protein sequence it is compared against.
	dna=read_seq("dna.txt")
	prt=read_seq("protein.txt")

	# Check accuracy: does the translated slice match the protein sequence?
	print (prt == translate(dna[20:938])[:-1])
	# When NCBI provides a CDS, the DNA segment it gives ends with the stop codon,
	# so the last translated character has to be dropped.
	import string
	# Cipher alphabet: a space followed by the 26 lower-case letters (27 symbols).
	alphabet = " " + string.ascii_lowercase

	# Lookup from each alphabet character to its index in ``alphabet``.
	positions = {}
	for i in range(len(alphabet)):
	    positions[alphabet[i]] = i

	message = "hi my name is caesar"
	encoded_message = ""

	# Caesar-shift every character one position forward.
	for i in message:
	    # The sum must be parenthesised: ``%`` binds tighter than ``+``, so the
	    # unparenthesised form never wraps and 'z' would index past the alphabet.
	    encoded_message += alphabet[(positions[i] + 1) % len(alphabet)]
	def encoding(message,key):
	    """Apply a Caesar shift of ``key`` positions to every character of ``message``.

	    Uses the module-level ``alphabet`` string and the ``positions`` lookup
	    (character -> index). Positions wrap around modulo 27, so a negative
	    ``key`` reverses the matching positive shift.

	    Args:
	        message: Text to shift; every character must be a key of
	            ``positions``, otherwise a KeyError is raised.
	        key: Integer number of positions to shift by.

	    Returns:
	        The shifted text as a new string.
	    """
	    # Nested indentation restored: the loop body had been flattened.
	    encoding_list = []
	    for char in message:
	        position = positions[char]
	        encoded_position = (position + key) % 27
	        encoding_list.append(alphabet[encoded_position])
	    encoded_string = "".join(encoding_list)
	    return encoded_string

	# Encode the message with a shift of three positions.
	encoded_message=encoding(message,3)

	print (encoded_message)

	# Decode by applying the opposite shift to the encoded text.
	decoded_message = encoding(encoded_message,-3)

	# print your decoded message here!
	print (decoded_message)
	from collections import Counter
	# Sample sentence passed to the word-counting functions below.
	text="This is my test text. We're friends."

	def count_words(text):
	    """Count how often each word occurs in ``text``, ignoring case and punctuation.

	    Args:
	        text: The string to analyse.

	    Returns:
	        A dict mapping each lower-cased word (split on single spaces) to the
	        number of times it occurs.
	    """
	    text = text.lower()
	    skips = [".", ",", ";", ":", "'", '"']
	    for mark in skips:
	        # str.replace returns a new string; without the reassignment the
	        # punctuation would never actually be removed.
	        text = text.replace(mark, "")

	    # Named differently from the function so the function is not shadowed.
	    word_counts = {}
	    for word in text.split(" "):
	        if word in word_counts:
	            word_counts[word] += 1
	        else:
	            word_counts[word] = 1
	    return word_counts

	def count_words_fast(text):
	    """Count word occurrences with ``collections.Counter``.

	    Args:
	        text: The string to analyse.

	    Returns:
	        A Counter mapping each lower-cased word (split on single spaces) to
	        the number of times it occurs, with punctuation removed.
	    """
	    text = text.lower()
	    skips = [".", ",", ";", ":", "'", '"']
	    for mark in skips:
	        # str.replace returns a new string; without the reassignment the
	        # punctuation would never actually be removed.
	        text = text.replace(mark, "")

	    return Counter(text.split(" "))

	# Print whether both word-counting implementations return equal results.
	print (count_words(text)==count_words_fast(text))
	def read_book(title_path):
	    """Load a UTF-8 text file and return its contents as a single line.

	    Args:
	        title_path: Path of the book file to read.

	    Returns:
	        The file's text with all newline and carriage-return characters
	        removed (nothing is inserted in their place).
	    """
	    # Nested indentation restored: the body had been flattened to one level.
	    with open(title_path, "r", encoding="utf8") as current_file:
	        text = current_file.read()
	        text = text.replace("\n", "").replace("\r", "")
	    return text

	def word_stats(word_counts):
	    """Summarise a word-frequency mapping.

	    Args:
	        word_counts: Mapping of word -> occurrence count.

	    Returns:
	        A tuple ``(num_unique, counts)``: the number of entries in the
	        mapping and the mapping's values view.
	    """
	    # Nested indentation restored: the body had been flattened to one level.
	    num_unique = len(word_counts)
	    counts = word_counts.values()
	    return (num_unique, counts)


	# NOTE(review): "filepath" looks like a placeholder path; confirm the intended file.
	text=read_book("filepath")
	word_counts=count_words(text)
	(num_unique, counts)=word_stats(word_counts)
	print (num_unique, sum(counts)) ## number of distinct words, and total number of words
	import os
	book_dir="./Books"

	import pandas as pd
	# One row per book, keyed by a running number that starts at 1.
	stats=pd.DataFrame(columns=('language','author','title','length','unique'))
	title_num=1

	# Nested indentation restored: the three-level directory walk
	# (<book_dir>/<language>/<author>/<title>) had been flattened.
	for language in os.listdir(book_dir):  # os.listdir returns every entry name under the path
	    for author in os.listdir(book_dir+'/'+language):
	        for title in os.listdir(book_dir+'/'+language+'/'+author):
	            inputfile=book_dir+'/'+language+'/'+author+'/'+title
	            text=read_book(inputfile)
	            (num_unique, counts)=word_stats(count_words(text))
	            # 'length' holds the summed word counts, 'unique' the number of distinct words.
	            stats.loc[title_num]=language,author.capitalize(),title.replace(".txt",""),sum(counts),num_unique
	            title_num +=1
	import matplotlib.pyplot as plt

	plt.figure(figsize=(10,10))
	# Draw one log-log series per language, each in its own colour.
	for lang_name, lang_color in (
	    ("English", "crimson"),
	    ("French", "forestgreen"),
	    ("German", "orange"),
	    ("Portuguese", "blueviolet"),
	):
	    subset = stats[stats.language == lang_name]
	    plt.loglog(subset.length, subset.unique, 'o', label=lang_name, color=lang_color)
	plt.legend()
	plt.xlabel("Book length")
	plt.ylabel("Number of unique words")
	plt.savefig("lang_plt.png")