Last active
August 29, 2015 14:05
-
-
Save Khalefa/4dda1c043ecf674aee86 to your computer and use it in GitHub Desktop.
DBLP parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!ELEMENT dblp (article|inproceedings|proceedings|book|incollection|
phdthesis|mastersthesis|www)*>
<!ENTITY % field "author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|publisher|note|crossref|isbn|series|school|chapter"> | |
<!ELEMENT article (%field;)*> | |
<!ATTLIST article | |
key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
reviewid CDATA #IMPLIED | |
rating CDATA #IMPLIED | |
> | |
<!ELEMENT inproceedings (%field;)*> | |
<!ATTLIST inproceedings key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT proceedings (%field;)*> | |
<!ATTLIST proceedings key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT book (%field;)*> | |
<!ATTLIST book key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT incollection (%field;)*> | |
<!ATTLIST incollection key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT phdthesis (%field;)*> | |
<!ATTLIST phdthesis key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT mastersthesis (%field;)*> | |
<!ATTLIST mastersthesis key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT www (%field;)*> | |
<!ATTLIST www key CDATA #REQUIRED | |
mdate CDATA #IMPLIED | |
publtype CDATA #IMPLIED | |
> | |
<!ELEMENT author (#PCDATA)> | |
<!ATTLIST author | |
bibtex CDATA #IMPLIED | |
> | |
<!ELEMENT editor (#PCDATA)> | |
<!ELEMENT address (#PCDATA)> | |
<!ENTITY % titlecontents "#PCDATA|sub|sup|i|tt|ref"> | |
<!ELEMENT title (%titlecontents;)*> | |
<!ATTLIST title | |
bibtex CDATA #IMPLIED | |
> | |
<!ELEMENT booktitle (#PCDATA)> | |
<!ELEMENT pages (#PCDATA)> | |
<!ELEMENT year (#PCDATA)> | |
<!ELEMENT journal (#PCDATA)> | |
<!ELEMENT volume (#PCDATA)> | |
<!ELEMENT number (#PCDATA)> | |
<!ELEMENT month (#PCDATA)> | |
<!ELEMENT url (#PCDATA)> | |
<!ELEMENT ee (#PCDATA)> | |
<!ELEMENT cite (#PCDATA)> | |
<!ELEMENT school (#PCDATA)> | |
<!ELEMENT publisher (#PCDATA)> | |
<!ATTLIST publisher | |
href CDATA #IMPLIED | |
> | |
<!ELEMENT note (#PCDATA)> | |
<!ATTLIST note | |
type CDATA #IMPLIED | |
> | |
<!ELEMENT cdrom (#PCDATA)> | |
<!ATTLIST cite | |
label CDATA #IMPLIED | |
> | |
<!ELEMENT crossref (#PCDATA)> | |
<!ELEMENT isbn (#PCDATA)> | |
<!ELEMENT chapter (#PCDATA)> | |
<!ELEMENT series (#PCDATA)> | |
<!ATTLIST series | |
href CDATA #IMPLIED | |
> | |
<!ELEMENT layout ANY> | |
<!ATTLIST layout | |
logo CDATA #IMPLIED | |
> | |
<!ELEMENT ref (#PCDATA)> | |
<!ATTLIST ref href CDATA #REQUIRED> | |
<!ELEMENT sup (%titlecontents;)*> | |
<!ELEMENT sub (%titlecontents;)*> | |
<!ELEMENT i (%titlecontents;)*> | |
<!ELEMENT tt (%titlecontents;)*> | |
<!ENTITY reg "®"> | |
<!ENTITY micro "µ"> | |
<!ENTITY times "×"> | |
<!-- (C) International Organization for Standardization 1986 | |
Permission to copy in any form is granted for use with | |
conforming SGML systems and applications as defined in | |
ISO 8879, provided this notice is included in all copies. | |
--> | |
<!-- Character entity set. Typical invocation: | |
<!ENTITY % HTMLlat1 PUBLIC | |
"ISO 8879-1986//ENTITIES Added Latin 1//EN//XML"> | |
--> | |
<!-- This version of the entity set can be used with any SGML document | |
which uses ISO 8859-1 or ISO 10646 as its document character | |
set. This includes XML documents and ISO HTML documents. | |
--> | |
<!ENTITY Agrave "À" ><!-- capital A, grave accent --> | |
<!ENTITY Aacute "Á" ><!-- capital A, acute accent --> | |
<!ENTITY Acirc "Â" ><!-- capital A, circumflex accent --> | |
<!ENTITY Atilde "Ã" ><!-- capital A, tilde --> | |
<!ENTITY Auml "Ä" ><!-- capital A, dieresis or umlaut mark --> | |
<!ENTITY Aring "Å" ><!-- capital A, ring --> | |
<!ENTITY AElig "Æ" ><!-- capital AE diphthong (ligature) --> | |
<!ENTITY Ccedil "Ç" ><!-- capital C, cedilla --> | |
<!ENTITY Egrave "È" ><!-- capital E, grave accent --> | |
<!ENTITY Eacute "É" ><!-- capital E, acute accent --> | |
<!ENTITY Ecirc "Ê" ><!-- capital E, circumflex accent --> | |
<!ENTITY Euml "Ë" ><!-- capital E, dieresis or umlaut mark --> | |
<!ENTITY Igrave "Ì" ><!-- capital I, grave accent --> | |
<!ENTITY Iacute "Í" ><!-- capital I, acute accent --> | |
<!ENTITY Icirc "Î" ><!-- capital I, circumflex accent --> | |
<!ENTITY Iuml "Ï" ><!-- capital I, dieresis or umlaut mark --> | |
<!ENTITY ETH "Ð" ><!-- capital Eth, Icelandic --> | |
<!ENTITY Ntilde "Ñ" ><!-- capital N, tilde --> | |
<!ENTITY Ograve "Ò" ><!-- capital O, grave accent --> | |
<!ENTITY Oacute "Ó" ><!-- capital O, acute accent --> | |
<!ENTITY Ocirc "Ô" ><!-- capital O, circumflex accent --> | |
<!ENTITY Otilde "Õ" ><!-- capital O, tilde --> | |
<!ENTITY Ouml "Ö" ><!-- capital O, dieresis or umlaut mark --> | |
<!ENTITY Oslash "Ø" ><!-- capital O, slash --> | |
<!ENTITY Ugrave "Ù" ><!-- capital U, grave accent --> | |
<!ENTITY Uacute "Ú" ><!-- capital U, acute accent --> | |
<!ENTITY Ucirc "Û" ><!-- capital U, circumflex accent --> | |
<!ENTITY Uuml "Ü" ><!-- capital U, dieresis or umlaut mark --> | |
<!ENTITY Yacute "Ý" ><!-- capital Y, acute accent --> | |
<!ENTITY THORN "Þ" ><!-- capital THORN, Icelandic --> | |
<!ENTITY szlig "ß" ><!-- small sharp s, German (sz ligature) --> | |
<!ENTITY agrave "à" ><!-- small a, grave accent --> | |
<!ENTITY aacute "á" ><!-- small a, acute accent --> | |
<!ENTITY acirc "â" ><!-- small a, circumflex accent --> | |
<!ENTITY atilde "ã" ><!-- small a, tilde --> | |
<!ENTITY auml "ä" ><!-- small a, dieresis or umlaut mark --> | |
<!ENTITY aring "å" ><!-- small a, ring --> | |
<!ENTITY aelig "æ" ><!-- small ae diphthong (ligature) --> | |
<!ENTITY ccedil "ç" ><!-- small c, cedilla --> | |
<!ENTITY egrave "è" ><!-- small e, grave accent --> | |
<!ENTITY eacute "é" ><!-- small e, acute accent --> | |
<!ENTITY ecirc "ê" ><!-- small e, circumflex accent --> | |
<!ENTITY euml "ë" ><!-- small e, dieresis or umlaut mark --> | |
<!ENTITY igrave "ì" ><!-- small i, grave accent --> | |
<!ENTITY iacute "í" ><!-- small i, acute accent --> | |
<!ENTITY icirc "î" ><!-- small i, circumflex accent --> | |
<!ENTITY iuml "ï" ><!-- small i, dieresis or umlaut mark --> | |
<!ENTITY eth "ð" ><!-- small eth, Icelandic --> | |
<!ENTITY ntilde "ñ" ><!-- small n, tilde --> | |
<!ENTITY ograve "ò" ><!-- small o, grave accent --> | |
<!ENTITY oacute "ó" ><!-- small o, acute accent --> | |
<!ENTITY ocirc "ô" ><!-- small o, circumflex accent --> | |
<!ENTITY otilde "õ" ><!-- small o, tilde --> | |
<!ENTITY ouml "ö" ><!-- small o, dieresis or umlaut mark --> | |
<!ENTITY oslash "ø" ><!-- small o, slash --> | |
<!ENTITY ugrave "ù" ><!-- small u, grave accent --> | |
<!ENTITY uacute "ú" ><!-- small u, acute accent --> | |
<!ENTITY ucirc "û" ><!-- small u, circumflex accent --> | |
<!ENTITY uuml "ü" ><!-- small u, dieresis or umlaut mark --> | |
<!ENTITY yacute "ý" ><!-- small y, acute accent --> | |
<!ENTITY thorn "þ" ><!-- small thorn, Icelandic --> | |
<!ENTITY yuml "ÿ" ><!-- small y, dieresis or umlaut mark --> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
using System.Xml; | |
using System.Xml.Linq; | |
using System.IO; | |
namespace dblp | |
{ | |
class Program | |
{ | |
/// <summary>
/// Normalizes a string into a lower-case key: letters and digits are kept,
/// every whitespace character becomes '_', and all other characters
/// (punctuation, symbols) are dropped.
/// </summary>
/// <param name="str">The text to normalize; must not be null.</param>
/// <returns>The normalized key; empty when <paramref name="str"/> contains no letters, digits or whitespace.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
static string clean_str(string str)
{
    if (str == null)
    {
        throw new ArgumentNullException("str");
    }

    StringBuilder cleaned = new StringBuilder(str.Length);

    // Invariant lower-casing keeps the generated keys identical regardless of
    // the machine's current culture (e.g. Turkish dotted/dotless I rules).
    foreach (char c in str.ToLowerInvariant())
    {
        if (char.IsWhiteSpace(c))
        {
            cleaned.Append('_');
        }
        else if (char.IsLetterOrDigit(c))
        {
            cleaned.Append(c);
        }
    }

    return cleaned.ToString();
}
/// <summary>
/// Streams <c>dblp.xml</c> from the working directory and, for every record
/// element directly below the document root, writes one normalized line to
/// the console and to the file <c>dblp.f</c>.
/// </summary>
/// <remarks>
/// Each line is the cleaned author names concatenated without a separator,
/// followed by '_' and the cleaned first title of the record. Authors are only
/// collected for <c>article</c> and <c>inproceedings</c> records; any other
/// record type yields an empty author part. Records without a title produce
/// no output line.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string f = @"dblp.xml";

    XmlReaderSettings settings = new XmlReaderSettings();
    // DtdProcessing.Parse is the supported replacement for the obsolete
    // "ProhibitDtd = false"; the DTD is needed so named entities resolve.
    settings.DtdProcessing = DtdProcessing.Parse;
    settings.ValidationType = ValidationType.None;
    settings.CheckCharacters = false;
    settings.IgnoreWhitespace = true;
    settings.IgnoreComments = true;
    settings.IgnoreProcessingInstructions = true;

    // "using" guarantees both the output file and the reader are released
    // even when an unexpected exception escapes the loop.
    using (StreamWriter sw = new StreamWriter("dblp.f"))
    using (XmlReader reader = XmlReader.Create(f, settings))
    {
        sw.AutoFlush = true;

        while (!reader.EOF)
        {
            XElement record;
            try
            {
                reader.MoveToContent();
                if (reader.Depth != 1)
                {
                    // Not positioned on a direct child of the root yet.
                    reader.Read();
                    continue;
                }

                // Materializes only the current record and leaves the reader
                // positioned just after it.
                record = XNode.ReadFrom(reader) as XElement;
            }
            catch (Exception readError)
            {
                Console.WriteLine(readError.Message);

                // A reader that has left the Interactive state cannot make
                // further progress, so stop instead of looping on it.
                if (reader.ReadState != ReadState.Interactive)
                {
                    break;
                }

                reader.Skip();
                continue;
            }

            if (record == null)
            {
                // Non-element content directly below the root; nothing to emit.
                continue;
            }

            string title = record.DescendantsAndSelf("title")
                                 .Select(t => t.Value)
                                 .FirstOrDefault();
            if (title == null)
            {
                // The reader is already past this record, so it must not be
                // advanced again here: doing so would drop the next record.
                continue;
            }

            string recordType = record.Name.LocalName;
            string authors_s = "";
            if (recordType == "article" || recordType == "inproceedings")
            {
                authors_s = string.Concat(
                    record.Descendants("author").Select(a => clean_str(a.Value.Trim())));
            }

            string line = authors_s + '_' + clean_str(title);
            Console.WriteLine(line);
            sw.WriteLine(line);
        }
    }
}
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="iso-8859-1"?> | |
<dblp> | |
<article mdate="2002-01-03" key="persons/Codd71a"> | |
<author>E. F. Codd</author> | |
<title>Further Normalization of the Data Base Relational Model.</title> | |
<journal>IBM Research Report, San Jose, California</journal> | |
<volume>RJ909</volume> | |
<month>August</month> | |
<year>1971</year> | |
<cdrom>ibmTR/rj909.pdf</cdrom> | |
<ee>db/labs/ibm/RJ909.html</ee> | |
</article> | |
<article mdate="2002-01-03" key="persons/Hall74"> | |
<author>Patrick A. V. Hall</author> | |
<title>Common Subexpression Identification in General Algebraic Systems.</title> | |
<journal>Technical Rep. UKSC 0060, IBM United Kingdom Scientific Centre</journal> | |
<month>November</month> | |
<year>1974</year> | |
</article> | |
<article mdate="2002-01-03" key="persons/Tresch96"> | |
<author>Markus Zürich Tresch</author> | |
<title>Principles of Distributed Object Database Languages.</title> | |
<journal>technical Report 248, ETH Zürich, Dept. of Computer Science</journal> | |
<month>July</month> | |
<year>1996</year> | |
</article> | |
</dblp> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment