Skip to content

Instantly share code, notes, and snippets.

@Khalefa
Last active August 29, 2015 14:05
Show Gist options
  • Save Khalefa/4dda1c043ecf674aee86 to your computer and use it in GitHub Desktop.
Save Khalefa/4dda1c043ecf674aee86 to your computer and use it in GitHub Desktop.
DBLP parser
<!ELEMENT dblp (article|inproceedings|proceedings|book|incollection|
phdthesis|mastersthesis|www)*>
<!ENTITY % field "author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|publisher|note|crossref|isbn|series|school|chapter">
<!ELEMENT article (%field;)*>
<!ATTLIST article
key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
reviewid CDATA #IMPLIED
rating CDATA #IMPLIED
>
<!ELEMENT inproceedings (%field;)*>
<!ATTLIST inproceedings key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT proceedings (%field;)*>
<!ATTLIST proceedings key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT book (%field;)*>
<!ATTLIST book key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT incollection (%field;)*>
<!ATTLIST incollection key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT phdthesis (%field;)*>
<!ATTLIST phdthesis key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT mastersthesis (%field;)*>
<!ATTLIST mastersthesis key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT www (%field;)*>
<!ATTLIST www key CDATA #REQUIRED
mdate CDATA #IMPLIED
publtype CDATA #IMPLIED
>
<!ELEMENT author (#PCDATA)>
<!ATTLIST author
bibtex CDATA #IMPLIED
>
<!ELEMENT editor (#PCDATA)>
<!ELEMENT address (#PCDATA)>
<!ENTITY % titlecontents "#PCDATA|sub|sup|i|tt|ref">
<!ELEMENT title (%titlecontents;)*>
<!ATTLIST title
bibtex CDATA #IMPLIED
>
<!ELEMENT booktitle (#PCDATA)>
<!ELEMENT pages (#PCDATA)>
<!ELEMENT year (#PCDATA)>
<!ELEMENT journal (#PCDATA)>
<!ELEMENT volume (#PCDATA)>
<!ELEMENT number (#PCDATA)>
<!ELEMENT month (#PCDATA)>
<!ELEMENT url (#PCDATA)>
<!ELEMENT ee (#PCDATA)>
<!ELEMENT cite (#PCDATA)>
<!ELEMENT school (#PCDATA)>
<!ELEMENT publisher (#PCDATA)>
<!ATTLIST publisher
href CDATA #IMPLIED
>
<!ELEMENT note (#PCDATA)>
<!ATTLIST note
type CDATA #IMPLIED
>
<!ELEMENT cdrom (#PCDATA)>
<!ATTLIST cite
label CDATA #IMPLIED
>
<!ELEMENT crossref (#PCDATA)>
<!ELEMENT isbn (#PCDATA)>
<!ELEMENT chapter (#PCDATA)>
<!ELEMENT series (#PCDATA)>
<!ATTLIST series
href CDATA #IMPLIED
>
<!ELEMENT layout ANY>
<!ATTLIST layout
logo CDATA #IMPLIED
>
<!ELEMENT ref (#PCDATA)>
<!ATTLIST ref href CDATA #REQUIRED>
<!ELEMENT sup (%titlecontents;)*>
<!ELEMENT sub (%titlecontents;)*>
<!ELEMENT i (%titlecontents;)*>
<!ELEMENT tt (%titlecontents;)*>
<!-- Symbol entities: reg = registered sign (U+00AE), micro = micro sign (U+00B5),
times = multiplication sign (U+00D7). -->
<!ENTITY reg "&#174;">
<!ENTITY micro "&#181;">
<!ENTITY times "&#215;">
<!-- (C) International Organization for Standardization 1986
Permission to copy in any form is granted for use with
conforming SGML systems and applications as defined in
ISO 8879, provided this notice is included in all copies.
-->
<!-- Character entity set. Typical invocation:
<!ENTITY % HTMLlat1 PUBLIC
"ISO 8879-1986//ENTITIES Added Latin 1//EN//XML">
-->
<!-- This version of the entity set can be used with any SGML document
which uses ISO 8859-1 or ISO 10646 as its document character
set. This includes XML documents and ISO HTML documents.
-->
<!ENTITY Agrave "&#192;" ><!-- capital A, grave accent -->
<!ENTITY Aacute "&#193;" ><!-- capital A, acute accent -->
<!ENTITY Acirc "&#194;" ><!-- capital A, circumflex accent -->
<!ENTITY Atilde "&#195;" ><!-- capital A, tilde -->
<!ENTITY Auml "&#196;" ><!-- capital A, dieresis or umlaut mark -->
<!ENTITY Aring "&#197;" ><!-- capital A, ring -->
<!ENTITY AElig "&#198;" ><!-- capital AE diphthong (ligature) -->
<!ENTITY Ccedil "&#199;" ><!-- capital C, cedilla -->
<!ENTITY Egrave "&#200;" ><!-- capital E, grave accent -->
<!ENTITY Eacute "&#201;" ><!-- capital E, acute accent -->
<!ENTITY Ecirc "&#202;" ><!-- capital E, circumflex accent -->
<!ENTITY Euml "&#203;" ><!-- capital E, dieresis or umlaut mark -->
<!ENTITY Igrave "&#204;" ><!-- capital I, grave accent -->
<!ENTITY Iacute "&#205;" ><!-- capital I, acute accent -->
<!ENTITY Icirc "&#206;" ><!-- capital I, circumflex accent -->
<!ENTITY Iuml "&#207;" ><!-- capital I, dieresis or umlaut mark -->
<!ENTITY ETH "&#208;" ><!-- capital Eth, Icelandic -->
<!ENTITY Ntilde "&#209;" ><!-- capital N, tilde -->
<!ENTITY Ograve "&#210;" ><!-- capital O, grave accent -->
<!ENTITY Oacute "&#211;" ><!-- capital O, acute accent -->
<!ENTITY Ocirc "&#212;" ><!-- capital O, circumflex accent -->
<!ENTITY Otilde "&#213;" ><!-- capital O, tilde -->
<!ENTITY Ouml "&#214;" ><!-- capital O, dieresis or umlaut mark -->
<!ENTITY Oslash "&#216;" ><!-- capital O, slash -->
<!ENTITY Ugrave "&#217;" ><!-- capital U, grave accent -->
<!ENTITY Uacute "&#218;" ><!-- capital U, acute accent -->
<!ENTITY Ucirc "&#219;" ><!-- capital U, circumflex accent -->
<!ENTITY Uuml "&#220;" ><!-- capital U, dieresis or umlaut mark -->
<!ENTITY Yacute "&#221;" ><!-- capital Y, acute accent -->
<!ENTITY THORN "&#222;" ><!-- capital THORN, Icelandic -->
<!ENTITY szlig "&#223;" ><!-- small sharp s, German (sz ligature) -->
<!ENTITY agrave "&#224;" ><!-- small a, grave accent -->
<!ENTITY aacute "&#225;" ><!-- small a, acute accent -->
<!ENTITY acirc "&#226;" ><!-- small a, circumflex accent -->
<!ENTITY atilde "&#227;" ><!-- small a, tilde -->
<!ENTITY auml "&#228;" ><!-- small a, dieresis or umlaut mark -->
<!ENTITY aring "&#229;" ><!-- small a, ring -->
<!ENTITY aelig "&#230;" ><!-- small ae diphthong (ligature) -->
<!ENTITY ccedil "&#231;" ><!-- small c, cedilla -->
<!ENTITY egrave "&#232;" ><!-- small e, grave accent -->
<!ENTITY eacute "&#233;" ><!-- small e, acute accent -->
<!ENTITY ecirc "&#234;" ><!-- small e, circumflex accent -->
<!ENTITY euml "&#235;" ><!-- small e, dieresis or umlaut mark -->
<!ENTITY igrave "&#236;" ><!-- small i, grave accent -->
<!ENTITY iacute "&#237;" ><!-- small i, acute accent -->
<!ENTITY icirc "&#238;" ><!-- small i, circumflex accent -->
<!ENTITY iuml "&#239;" ><!-- small i, dieresis or umlaut mark -->
<!ENTITY eth "&#240;" ><!-- small eth, Icelandic -->
<!ENTITY ntilde "&#241;" ><!-- small n, tilde -->
<!ENTITY ograve "&#242;" ><!-- small o, grave accent -->
<!ENTITY oacute "&#243;" ><!-- small o, acute accent -->
<!ENTITY ocirc "&#244;" ><!-- small o, circumflex accent -->
<!ENTITY otilde "&#245;" ><!-- small o, tilde -->
<!ENTITY ouml "&#246;" ><!-- small o, dieresis or umlaut mark -->
<!ENTITY oslash "&#248;" ><!-- small o, slash -->
<!ENTITY ugrave "&#249;" ><!-- small u, grave accent -->
<!ENTITY uacute "&#250;" ><!-- small u, acute accent -->
<!ENTITY ucirc "&#251;" ><!-- small u, circumflex accent -->
<!ENTITY uuml "&#252;" ><!-- small u, dieresis or umlaut mark -->
<!ENTITY yacute "&#253;" ><!-- small y, acute accent -->
<!ENTITY thorn "&#254;" ><!-- small thorn, Icelandic -->
<!ENTITY yuml "&#255;" ><!-- small y, dieresis or umlaut mark -->
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using System.Xml.Linq;
using System.IO;
namespace dblp
{
    /// <summary>
    /// Streams a DBLP XML dump record by record and emits one normalized
    /// "authors_title" key per record, both to the console and to an output file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Normalizes a piece of text into a key fragment: lower-cases it, drops every
        /// character that is neither a letter, a digit nor whitespace, and replaces each
        /// whitespace character with an underscore.
        /// </summary>
        /// <param name="str">Text to normalize; may be null or empty.</param>
        /// <returns>The normalized text, or an empty string when <paramref name="str"/> is null or empty.</returns>
        internal static string clean_str(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return string.Empty;
            }

            StringBuilder cleaned = new StringBuilder(str.Length);

            // Invariant lower-casing keeps the generated keys identical regardless of the
            // machine's current culture (e.g. the Turkish dotless-i mapping).
            foreach (char c in str.ToLowerInvariant())
            {
                if (char.IsWhiteSpace(c))
                {
                    cleaned.Append('_');
                }
                else if (char.IsLetterOrDigit(c))
                {
                    cleaned.Append(c);
                }
            }

            return cleaned.ToString();
        }

        /// <summary>
        /// Builds the output key for a single record element.
        /// </summary>
        /// <param name="record">A direct child element of the document root.</param>
        /// <returns>
        /// The cleaned author names followed by '_' and the cleaned title, or null when the
        /// record has no title element.
        /// </returns>
        static string BuildKey(XElement record)
        {
            XElement titleElement = record.Descendants("title").FirstOrDefault();
            if (titleElement == null)
            {
                return null;
            }

            StringBuilder key = new StringBuilder();

            // Authors are only included for articles and conference papers; every other
            // record type yields a key that starts with the '_' separator.
            string recordType = record.Name.LocalName;
            if (recordType == "article" || recordType == "inproceedings")
            {
                // NOTE(review): author names are concatenated without a separator between
                // them, exactly as before - confirm that this is the intended key format.
                foreach (XElement author in record.Descendants("author"))
                {
                    key.Append(clean_str(author.Value.Trim()));
                }
            }

            key.Append('_').Append(clean_str(titleElement.Value));
            return key.ToString();
        }

        /// <summary>
        /// Entry point. Reads the DBLP XML file and writes one key per record.
        /// </summary>
        /// <param name="args">
        /// Optional: args[0] is the input XML path (default "dblp.xml"),
        /// args[1] is the output path (default "dblp.f").
        /// </param>
        static void Main(string[] args)
        {
            string inputPath = (args != null && args.Length > 0) ? args[0] : "dblp.xml";
            string outputPath = (args != null && args.Length > 1) ? args[1] : "dblp.f";

            XmlReaderSettings settings = new XmlReaderSettings();
            // The DTD must be processed so that the named character entities it declares
            // (e.g. &uuml;) can be expanded. DtdProcessing replaces the obsolete ProhibitDtd.
            settings.DtdProcessing = DtdProcessing.Parse;
            settings.ValidationType = ValidationType.None;
            settings.CheckCharacters = false;
            settings.IgnoreWhitespace = true;
            settings.IgnoreComments = true;
            settings.IgnoreProcessingInstructions = true;

            // using blocks guarantee both the output file and the reader are released even
            // when opening the input or parsing it fails.
            using (StreamWriter sw = new StreamWriter(outputPath))
            using (XmlReader reader = XmlReader.Create(inputPath, settings))
            {
                // Flush after every line so that partial results survive an aborted run.
                sw.AutoFlush = true;

                try
                {
                    reader.MoveToContent();
                    while (!reader.EOF)
                    {
                        // Records are the element children of the root, i.e. elements at depth 1.
                        if (reader.NodeType != XmlNodeType.Element || reader.Depth != 1)
                        {
                            if (!reader.Read())
                            {
                                break;
                            }
                            continue;
                        }

                        // ReadFrom consumes the whole record and leaves the reader on the node
                        // that follows it, so no extra Read()/Skip() is needed afterwards.
                        XElement record = (XElement)XNode.ReadFrom(reader);

                        string key = BuildKey(record);
                        if (key == null)
                        {
                            // Record without a title: nothing to emit, move on to the next one.
                            continue;
                        }

                        Console.WriteLine(key);
                        sw.WriteLine(key);
                    }
                }
                catch (XmlException e)
                {
                    // A parse error leaves the reader in the Error state and it cannot be
                    // resumed, so report the problem and stop instead of looping.
                    Console.Error.WriteLine(e.Message);
                }
            }
        }
    }
}
<?xml version="1.0" encoding="iso-8859-1"?>
<dblp>
<article mdate="2002-01-03" key="persons/Codd71a">
<author>E. F. Codd</author>
<title>Further Normalization of the Data Base Relational Model.</title>
<journal>IBM Research Report, San Jose, California</journal>
<volume>RJ909</volume>
<month>August</month>
<year>1971</year>
<cdrom>ibmTR/rj909.pdf</cdrom>
<ee>db/labs/ibm/RJ909.html</ee>
</article>
<article mdate="2002-01-03" key="persons/Hall74">
<author>Patrick A. V. Hall</author>
<title>Common Subexpression Identification in General Algebraic Systems.</title>
<journal>Technical Rep. UKSC 0060, IBM United Kingdom Scientific Centre</journal>
<month>November</month>
<year>1974</year>
</article>
<article mdate="2002-01-03" key="persons/Tresch96">
<author>Markus Z&uuml;rich Tresch</author>
<title>Principles of Distributed Object Database Languages.</title>
<journal>technical Report 248, ETH Z&uuml;rich, Dept. of Computer Science</journal>
<month>July</month>
<year>1996</year>
</article>
</dblp>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment