Skip to content

Instantly share code, notes, and snippets.

@crazy4groovy
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save crazy4groovy/9cc65a64601ea50150ab to your computer and use it in GitHub Desktop.
Save crazy4groovy/9cc65a64601ea50150ab to your computer and use it in GitHub Desktop.
Extracts XML data into CSV format. Can especially help interpret large BISAC XML files.
/**
* @name nodeDataExtractor.groovy
* @author Steven Olsen <spam2steve@gmail.com>
* @description Extracts XML data into CSV format. Can especially help interpret large BISAC XML files.
* https://www.bisg.org/complete-bisac-subject-headings-2014-edition
* @version 0.1
*/
/////////////////////////////
// ---------------------------------------------------------------------------
// Configuration, taken from the command line:
//   args[0] - path of the XML file to read (defaults to C:\input.xml)
//   args[1] - name of the repeating element under the root to extract
//             (defaults to 'product'); each such element becomes one CSV column
// Supplying args[1] also switches off the BISAC-specific behaviour below
// (short-tag lookup, digit-only filter, long-text truncation).
// ---------------------------------------------------------------------------
boolean customBaseNode = args.size() >= 2
File f = new File(args.size() >= 1 ? args[0] : /C:\input.xml/)
String BASE_NODE = customBaseNode ? args[1] : 'product'
boolean isBisacXML = !customBaseNode
boolean truncateText = !customBaseNode
String NODE_DELIMITER = '/'
/////////////////////////////
def slurper = new XmlSlurper()
// allow top XML doctype tag to exist
slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
def root = slurper.parse(f)

// One map per BASE_NODE element: node path -> list of text values found at that path.
List<Map> nodes = []
// Node path -> number of times that path occurred across all BASE_NODE elements.
Map nodeKeysCtr = [:].withDefault { 0 }

// start at BASE_NODE in the node tree
root[BASE_NODE].each { p ->
    Map product = [:].withDefault { [] }
    // '**' walks p and all of its descendants depth-first.
    p.'**'.each { n ->
        String text = n.text()
        // Text containing markup is replaced by a placeholder to keep the CSV readable.
        if (truncateText && text.contains("<")) {
            text = "...long text..."
        }
        // Build the path by climbing parents until an element named BASE_NODE is reached.
        String name = ""
        def cursor = n
        while (cursor.name() != BASE_NODE) {
            name = NODE_DELIMITER + cursor.name() + name
            cursor = cursor.parent()
        }
        // An empty path means the node is itself a BASE_NODE element; nothing to record.
        if (name) {
            nodeKeysCtr[name]++
            product[name] << text
        }
    }
    nodes << product
}

Map nodeLookup = getLookup(isBisacXML)

// each column is a BASE_NODE
// The total is the number of BASE_NODE elements actually processed, so it stays
// correct when BASE_NODE is something other than 'product'.
println "Node,Human Name,# times used (total ${BASE_NODE}'s:${nodes.size()}),${BASE_NODE}1,${BASE_NODE}2,Etc..."

// each row is the prop for all BASE_NODEs
nodeKeysCtr.keySet().toList().sort().each { prop ->
    if (isBisacXML && !(prop ==~ /.*\d.*/)) return // only print nodes with an int in them
    // The last path segment is the key used for the human-readable name lookup.
    String bareProp = prop.split(NODE_DELIMITER)[-1]
    print "$prop,${nodeLookup[ bareProp ]},${nodeKeysCtr[prop]},"
    nodes.each { p ->
        // Commas and line breaks inside values would break the CSV layout; swap them for ';'.
        print "${p[prop].toString().replaceAll('[,\n\r]',';')},"
    }
    // Terminate the row so each property ends up on its own CSV line.
    println()
}
//END//
/////////////////////////////////////////////////
/////////////////////////////////////////////////
//UTILS//
/**
 * Builds the map used to translate a node's short tag (e.g. 'b004') into a
 * human-readable name (e.g. 'ISBN').
 *
 * NOTE(review): the tag/name pairs appear to be the ONIX for Books 2.1 short
 * tags and reference names - confirm against the spec before extending.
 *
 * @param returnVals when truthy, the full tag table is returned; when falsy,
 *                   an empty map is returned
 * @return a map that, in both cases, yields the requested key itself for any
 *         key it does not contain (via withDefault)
 */
Map getLookup(returnVals) {
    if (!returnVals) return ([:].withDefault{it})
    return ([
        a001:'RecordReference',
        a002:'NotificationType',
        a194:'RecordSourceType',
        a195:'RecordSourceIdentifierType',
        a196:'RecordSourceIdentifier',
        a197:'RecordSourceName',
        a198:'DeletionCode',
        a199:'DeletionText',
        a245:'SubordinateEntries',
        b003:'PublicationDate',
        b004:'ISBN',
        b005:'EAN13',
        b006:'UPC',
        b007:'PublisherProductNo',
        b008:'ISMN',
        b009:'DOI',
        b010:'ReplacesISBN',
        b011:'ReplacesEAN13',
        b012:'ProductForm',
        b013:'BookFormDetail',
        b014:'ProductFormDescription',
        b015:'ItemQuantity',
        b016:'SeriesISSN',
        b017:'PublisherSeriesCode',
        b018:'TitleOfSeries',
        b019:'NumberWithinSeries',
        b020:'YearOfAnnual',
        b021:'ISBNOfSet',
        b022:'EAN13OfSet',
        b023:'TitleOfSet',
        b024:'SetPartNumber',
        b025:'SetPartTitle',
        b026:'ItemNumberWithinSet',
        b027:'TextCaseFlag',
        b028:'DistinctiveTitle',
        b029:'Subtitle',
        b030:'TitlePrefix',
        b031:'TitleWithoutPrefix',
        b032:'TranslationOfTitle',
        b033:'FormerTitle',
        b034:'SequenceNumber',
        b035:'ContributorRole',
        b036:'PersonName',
        b037:'PersonNameInverted',
        b038:'TitlesBeforeNames',
        b039:'NamesBeforeKey',
        b040:'KeyNames',
        b041:'NamesAfterKey',
        b042:'LettersAfterNames',
        b043:'TitlesAfterNames',
        b044:'BiographicalNote',
        b045:'ProfessionalPosition',
        b046:'Affiliation',
        b047:'CorporateName',
        b048:'ContributorDescription',
        b049:'ContributorStatement',
        b050:'ConferenceDescription',
        b051:'ConferenceRole',
        b052:'ConferenceName',
        b053:'ConferenceNumber',
        b054:'ConferenceDate',
        b055:'ConferencePlace',
        b056:'EditionTypeCode',
        b057:'EditionNumber',
        b058:'EditionStatement',
        b059:'LanguageOfText',
        b060:'OriginalLanguage',
        b061:'NumberOfPages',
        b062:'IllustrationsNote',
        b063:'MapScale',
        b064:'BASICMainSubject',
        b065:'BICMainSubject',
        b066:'BICVersion',
        b067:'SubjectSchemeIdentifier',
        b068:'SubjectSchemeVersion',
        b069:'SubjectCode',
        b070:'SubjectHeadingText',
        b071:'CorporateBodyAsSubject',
        b072:'PlaceAsSubject',
        b073:'AudienceCode',
        b074:'AudienceRangeQualifier',
        b075:'AudienceRangePrecision',
        b076:'AudienceRangeValue',
        b077:'ComplexitySchemeIdentifier',
        b078:'ComplexityCode',
        b079:'ImprintName',
        b081:'PublisherName',
        b083:'CountryOfPublication',
        b084:'CopublisherName',
        b085:'SponsorName',
        b086:'AnnouncementDate',
        b087:'CopyrightYear',
        b088:'YearFirstPublished',
        b089:'SalesRightsType',
        b090:'RightsCountry',
        b091:'RightsRegion',
        b125:'NumberOfIllustrations',
        b171:'SubjectSchemeName',
        b189:'USSchoolGrade',
        b190:'InterestAge',
        b191:'MainSubjectSchemeIdentifier',
        b200:'BASICVersion',
        b201:'WorkIDType',
        b202:'TitleType',
        b203:'TitleText',
        b204:'AudienceCodeType',
        b205:'AudienceCodeTypeName',
        b206:'AudienceCodeValue',
        b207:'AudienceDescription',
        b209:'CityOfPublication',
        b210:'NumberOfPieces',
        b211:'EpubType',
        b212:'EpubTypeVersion',
        b213:'EpubTypeDescription',
        b214:'EpubFormat',
        b215:'EpubFormatVersion',
        b216:'EpubFormatDescription',
        b217:'EditionVersionNumber',
        b218:'ExtentType',
        b219:'ExtentValue',
        b220:'ExtentUnit',
        b221:'ProductIDType',
        b225:'ProductPackaging',
        b233:'IDTypeName',
        b240:'OriginalPublisher',
        b241:'NameCodeType',
        b242:'NameCodeTypeName',
        b243:'NameCodeValue',
        b244:'IDValue',
        b246:'Barcode',
        b247:'PrefixToKey',
        b248:'SuffixToKey',
        b249:'UnnamedPersons',
        b250:'PersonNameType',
        b251:'CountryCode',
        b252:'LanguageCode',
        b253:'LanguageRole',
        b254:'PagesRoman',
        b255:'PagesArabic',
        b256:'IllustrationType',
        b257:'Number',
        b273:'SeriesIDType',
        b274:'ProductClassificationType',
        b275:'ProductClassificationCode',
        b276:'AbbreviatedLength',
        b277:'EpubTypeNote',
        b278:'EpubSource',
        b279:'EpubSourceVersion',
        b280:'EpubSourceDescription',
        b281:'SetItemTitle',
        b282:'SeriesPartName',
        b284:'LevelSequenceNumber',
        b285:'TextItemIDType',
        b286:'FirstPageNumber',
        b287:'LastPageNumber',
        b288:'ComponentTypeName',
        b289:'ComponentNumber',
        b290:'TextItemType',
        b291:'PublishingRole',
        b294:'WebsiteDescription',
        b295:'WebsiteLink',
        b305:'PersonDateRole',
        b306:'Date',
        b324:'StartDate',
        b325:'EndDate',
        b333:'ProductFormDetail',
        b334:'ProductFormFeatureType',
        b335:'ProductFormFeatureValue',
        b336:'ProductFormFeatureDescription',
        b337:'Percent',
        b340:'SequenceNumberWithinRole',
        b341:'ConferenceAcronym',
        b342:'ConferenceTheme',
        b352:'BibleContents',
        b353:'BibleVersion',
        b354:'BiblePurpose',
        b355:'BibleTextOrganization',
        b356:'BibleReferenceLocation',
        b357:'BibleTextFeature',
        b358:'ReligiousTextFeatureType',
        b359:'ReligiousTextFeatureCode',
        // Previously the truncated 'ReligiousTextFeatureDescript’n' (with a typographic apostrophe).
        b360:'ReligiousTextFeatureDescription',
        b361:'IllustrationTypeDescription',
        b362:'TradeAnnouncementDate',
        b367:'WebsiteRole',
        b368:'ThesisType',
        b369:'ThesisPresentedTo',
        b370:'ThesisYear',
        b374:'TextSourceCorporate',
        b376:'ReligiousTextID',
        b381:'SalesRestrictionType',
        b382:'SalesOutletName',
        b383:'SalesRestrictionDetail',
        b384:'TradeCategory',
        b385:'ProductContentType',
        b388:'RightsTerritory',
        b389:'StudyBibleType',
        b390:'PersonNameIDType',
        b391:'ConferenceSponsorIDType',
        b392:'CopyrightOwnerIDType',
        b393:'SalesOutletIDType',
        b394:'PublishingStatus',
        b395:'PublishingStatusNote',
        b398:'RegionCode',
        c093:'MeasureTypeCode',
        c094:'Measurement',
        c095:'MeasureUnitCode',
        c096:'Height',
        c097:'Width',
        c098:'Thickness',
        c099:'Weight',
        c258:'Dimensions',
        d100:'Annotation',
        d101:'MainDescription',
        d102:'TextTypeCode',
        d103:'TextFormat',
        d104:'Text',
        d105:'TextLinkType',
        d106:'TextLink',
        d107:'TextAuthor',
        d108:'TextSourceTitle',
        d109:'TextPublicationDate',
        e110:'ReviewQuote',
        f111:'CoverImageFormatCode',
        f112:'CoverImageLinkTypeCode',
        f113:'CoverImageLink',
        f114:'MediaFileTypeCode',
        f115:'MediaFileFormatCode',
        f116:'MediaFileLinkTypeCode',
        f117:'MediaFileLink',
        f118:'TextWithDownload',
        f119:'DownloadCaption',
        f120:'DownloadCredit',
        f121:'DownloadCopyrightNotice',
        f122:'DownloadTerms',
        f123:'ProductWebsiteLink',
        f170:'ProductWebsiteDescription',
        f259:'ImageResolution',
        f373:'MediaFileDate',
        g124:'PrizesDescription',
        g126:'PrizeName',
        g127:'PrizeYear',
        g128:'PrizeCountry',
        g129:'PrizeCode',
        g343:'PrizeJury',
        h130:'ReplacedByISBN',
        h131:'ReplacedByEAN13',
        h132:'AlternativeFormatISBN',
        h133:'AlternativeFormatEAN13',
        h134:'OutOfPrintDate',
        h163:'AlternativeProductISBN',
        h164:'AlternativeProductEAN13',
        h208:'RelationCode',
        j135:'SupplierEANLocationNumber',
        j136:'SupplierSAN',
        j137:'SupplierName',
        j138:'SupplyToCountry',
        j139:'SupplyToRegion',
        j140:'SupplyToCountryExcluded',
        j141:'AvailabilityCode',
        j142:'ExpectedShipDate',
        j143:'OnSaleDate',
        j144:'OrderTime',
        j145:'PackQuantity',
        j146:'AudienceRestrictionFlag',
        j147:'AudienceRestrictionNote',
        j148:'PriceTypeCode',
        j149:'ClassOfTrade',
        j150:'BICDiscountGroupCode',
        j151:'PriceAmount',
        j152:'CurrencyCode',
        j153:'TaxRateCode1',
        j154:'TaxRatePercent1',
        j155:'TaxableAmount1',
        j156:'TaxAmount1',
        j157:'TaxRateCode2',
        j158:'TaxRatePercent2',
        j159:'TaxableAmount2',
        j160:'TaxAmount2',
        j161:'PriceEffectiveFrom',
        j162:'PriceEffectiveUntil',
        j192:'UnpricedItemType',
        j239:'PricePer',
        j260:'DateFormat',
        j261:'PriceQualifier',
        j262:'PriceTypeDescription',
        j263:'MinimumOrderQuantity',
        j264:'BatchQuantity',
        j265:'FreeQuantity',
        j266:'PriceStatus',
        j267:'DiscountPercent',
        j268:'ReturnsCodeType',
        j269:'ReturnsCode',
        j270:'TelephoneNumber',
        j271:'FaxNumber',
        j272:'EmailAddress',
        j292:'SupplierRole',
        j296:'StockQuantityCodeTypeName',
        j297:'StockQuantityCode',
        j302:'ExpectedDate',
        j303:'Territory',
        j304:'CountryExcluded',
        j308:'TerritoryExcluded',
        j345:'SupplierIDType',
        j348:'IntermediaryAvailabilityCode',
        j349:'LocationName',
        j350:'OnHand',
        j351:'OnOrder',
        j363:'DiscountCodeType',
        j364:'DiscountCode',
        j365:'ReissueDate',
        j366:'ReissueDescription',
        j375:'CBO',
        j377:'LocationIDType',
        j378:'DiscountCodeTypeName',
        j387:'LastDateForReturns',
        j396:'ProductAvailability',
        j397:'SupplyToTerritory',
        j399:'SupplyRestrictionDetail',
        j400:'AgentIDType',
        j401:'AgentName',
        j402:'AgentRole',
        j403:'MarketCountry',
        j404:'MarketTerritory',
        j405:'MarketCountryExcluded',
        j406:'MarketRestrictionDetail',
        j407:'MarketPublishingStatus',
        j408:'MarketDateRole',
        k165:'PromotionCampaign',
        k166:'PromotionContact',
        k167:'InitialPrintRun',
        k168:'CopiesSold',
        k169:'BookClubAdoption',
        k309:'ReprintDetail',
        m172:'FromEANNumber',
        m173:'FromSAN',
        m174:'FromCompany',
        m175:'FromPerson',
        m176:'ToEANNumber',
        m177:'ToSAN',
        m178:'ToCompany',
        m179:'ToPerson',
        m180:'MessageNumber',
        m181:'MessageRepeat',
        m182:'SentDate',
        m183:'MessageNote',
        m184:'DefaultLanguageOfText',
        m185:'DefaultPriceTypeCode',
        m186:'DefaultCurrencyCode',
        m187:'DefaultLinearUnit',
        m188:'DefaultWeightUnit',
        m193:'DefaultClassOfTrade',
        m283:'FromEmail',
        m379:'SenderIDType',
        m380:'AddresseeIDType',
        n338:'NoSeries',
        n339:'NoContributor',
        n386:'NoEdition'
    ].withDefault{it})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment