From 3d881eec13524c58c51389d8fe9956f40a92ed96 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Sat, 17 Jun 2006 21:21:15 +0000 Subject: [PATCH] - Make scrapers return standard ISO-style YYYY-MM-DD dates. Still need to work on journal article scrapers. - Ingester lets callback function save items, rather than saving them itself. - Better handling of multiple items in API, although no scrapers currently implement this. --- .../content/scholar/ingester/browser.js | 21 +- .../content/scholar/xpcom/ingester.js | 210 +++++++++--------- scrapers.sql | 30 ++- 3 files changed, 148 insertions(+), 113 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index ad768d915f..23fd5c9bfe 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -211,18 +211,20 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) { /* * Callback to be executed when scraping is complete */ -Scholar.Ingester.Interface._finishScraping = function(documentObject) { - if(documentObject.item) { +Scholar.Ingester.Interface._finishScraping = function(obj) { + if(obj.items.length) { + var item1 = obj.items[0]; + Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); - var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID")); + var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID")); var titleLabel = Scholar.getString("itemFields.title") + ":" - Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title")); - var creators = documentObject.item.numCreators(); + Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, item1.getField("title")); + var creators = item1.numCreators(); if(creators) { for(var i=0; i= 4) { + newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); } - } - if(this.model.data[uri][prefixDC + 'edition']) { - newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]); - } - if(this.model.data[uri][prefixDummy + 'series']) { - newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); - } - if(this.model.data[uri][prefixDummy + 'place']) { - newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); - } - if(this.model.data[uri][prefixDC + 'identifier']) { - for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { - newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); - break; + if(this.model.data[uri][prefixDC + 'hasVersion']) { + newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]); + } + if(this.model.data[uri][prefixDummy + 'series']) { + newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); + } + if(this.model.data[uri][prefixDummy + 'place']) { + newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); + } + if(this.model.data[uri][prefixDC + 'identifier']) { + for(i in this.model.data[uri][prefixDC + 'identifier']) { + if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { + newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } } } } + this.items.push(newItem); } - newItem.save(); - - // First one is stored so as to be accessible - if(!this.item) { - this.item = newItem; - } + } catch(ex) { + Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex); } } \ No newline at end of file diff --git a/scrapers.sql b/scrapers.sql index 1f2efb81e2..ae14f1e3aa 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,6 +22,30 @@ var cleanString = function(s) { return s.replace(/ +/g, " "); } +var dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; +} + var uri = doc.location.href; model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); @@ -43,10 +67,12 @@ for (var i = 0; i < elmts.length; i++) { var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) { var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); - if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { - var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1); + jsDate = new Date(jsDate); + var date = dateToISO(jsDate); + value = value.substring(0, value.lastIndexOf("(")-1); } if(value.lastIndexOf(";") != -1) {