Skip to content

Instantly share code, notes, and snippets.

View leebird's full-sized avatar

Gang Li leebird

  • Mountain View, CA
View GitHub Profile
@leebird
leebird / python2_unicode_string.py
Created October 8, 2016 00:49
Test Python 2 str and unicode types
# -*- coding: utf-8 -*-
# Test various characteristics of Unicode strings in Python 2.
# Python 2 has two types for storing string data: str and unicode.
# The str type is a byte string, while the unicode type stores
# Unicode code points, each of which is represented by one or more bytes.
# Define a simple ASCII string, the type is str.
ascii_a = 'abcdefg'
print 'OUTPUT 1'
print type(ascii_a)
@leebird
leebird / split_sent.py
Last active April 8, 2016 03:31
Split sentences using NLTK
''' Initialization: install NLTK python module and download data.
$ pip install nltk
$ echo 'import nltk; nltk.download("punkt")' | python
'''
from __future__ import print_function, unicode_literals
import nltk.data
_sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
@leebird
leebird / mirna_ner_api.py
Created April 6, 2016 18:08
Example of using miRTex miRNA NER API
from __future__ import print_function, unicode_literals
import json
import urllib
import urllib2
# The API URL.
api_url = 'http://research.bioinformatics.udel.edu/miRTex/ner'
# The documents to be processed.
documents = {
@leebird
leebird / tag_and_score.py
Created April 4, 2016 13:42
Scoring miRTex results
# Note that this is not a standalone script. It has dependencies.
# Here it is just used as an example illustrating the scoring
# process for miRTex results.
from __future__ import unicode_literals, print_function
import pickle
import os
import codecs
import sys
import re
@leebird
leebird / converter.py
Last active April 8, 2016 19:41
Simple JSON-to-BioC converter
from __future__ import unicode_literals
import sys
import codecs
import json
from lxml import etree
# See http://lxml.de/api.html#incremental-xml-generation
# for incremental XML generation used below.
@leebird
leebird / document_pb2.py
Created March 16, 2016 17:23
Compiled Python code for the document proto
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: document.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2