Still getting the hang of Subversion...the rest of the ingester code
This commit is contained in:
parent
24d35c5547
commit
551582eb7e
6 changed files with 663 additions and 0 deletions
140
chrome/chromeFiles/content/scholar/ingester/browser.js
Normal file
@@ -0,0 +1,140 @@
// Firefox Scholar Ingester Browser Functions
// Utilities based on code taken from Greasemonkey
// This code is licensed according to the GPL

// Prepare the browser and collector instrumentation caches --------------------
Scholar.Ingester.Interface = function() {}

Scholar.Ingester.Interface.init = function() {
    Scholar.Ingester.Interface.browsers = new Array();
    
    window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
    window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
    
    Scholar.Ingester.Interface.browsers = new Array();
    Scholar.Ingester.Interface.browserDocuments = new Object();
}

Scholar.Ingester.Interface.chromeLoad = function() {
    Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
    Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
    Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
    
    // this gives us onLocationChange
    Scholar.Ingester.Interface.tabBrowser.addProgressListener(Scholar.Ingester.Interface.Listener,
        Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
    // this gives us DOMContentLoaded
    Scholar.Ingester.Interface.appContent.addEventListener("DOMContentLoaded",
        Scholar.Ingester.Interface.contentLoad, true);
}

Scholar.Ingester.Interface.chromeUnload = function() {
    // Detach the progress listener registered in chromeLoad
    Scholar.Ingester.Interface.tabBrowser.removeProgressListener(Scholar.Ingester.Interface.Listener);
}

Scholar.Ingester.Interface.getDocument = function(browser) {
    try {
        var key = browser.getAttribute("scholar-key");
        if(Scholar.Ingester.Interface.browserDocuments[key]) {
            return Scholar.Ingester.Interface.browserDocuments[key];
        }
    } finally {}
    return false;
}

Scholar.Ingester.Interface.setDocument = function(browser) {
    try {
        var key = browser.getAttribute("scholar-key");
    } finally {
        if(!key) {
            var key = (new Date()).getTime();
            browser.setAttribute("scholar-key", key);
        }
    }
    Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
    Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}

Scholar.Ingester.Interface.deleteDocument = function(browser) {
    try {
        var key = browser.getAttribute("scholar-key");
        if(Scholar.Ingester.Interface.browserDocuments[key]) {
            delete Scholar.Ingester.Interface.browserDocuments[key];
            return true;
        }
    } finally {}
    return false;
}

Scholar.Ingester.Interface.scrapeThisPage = function() {
    var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
    if(document.scraper) {
        document.scrapePage();
    }
}

Scholar.Ingester.Interface.updateStatus = function(browser) {
    var document = Scholar.Ingester.Interface.getDocument(browser);
    if(document && document.scraper) {
        this.statusImage.src = "chrome://scholar/skin/capture_colored.png";
    } else {
        this.statusImage.src = "chrome://scholar/skin/capture_gray.png";
    }
}

Scholar.Ingester.Interface.contentLoad = function() {
    Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
    Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
}

Scholar.Ingester.Interface.Listener = function() {}
Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
Scholar.Ingester.Interface.Listener.onStateChange = function() {}
Scholar.Ingester.Interface.Listener.onLocationChange = function() {
    var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
    
    // Remove document object of any browser that no longer exists
    for (var i = 0; i < Scholar.Ingester.Interface.browsers.length; i++) {
        var browser = Scholar.Ingester.Interface.browsers[i];
        var exists = false;
        
        for (var j = 0; j < browsers.length; j++) {
            if (browser == browsers[j]) {
                exists = true;
                break;
            }
        }
        
        if (!exists) {
            Scholar.Ingester.Interface.browsers.splice(i,1);
            
            // To execute if document object does not exist
            Scholar.Ingester.Interface.deleteDocument(browser);
        }
    }
    
    /*// Add a collector to any new browser
    for (var i = 0; i < browsers.length; i++) {
        var browser = browsers[i];
        var exists = false;
        
        for (var j = 0; j < Scholar.Ingester.Interface.browsers.length; j++) {
            if (browser == Scholar.Ingester.Interface.browsers[j]) {
                exists = true;
                break;
            }
        }
        
        if (!exists) {
            Scholar.Ingester.Interface.browsers.splice(i,0,browser);
            
            // To execute if window is new
        }
    }*/
    
    Scholar.Ingester.Interface.updateStatus(
        Scholar.Ingester.Interface.tabBrowser.selectedBrowser
    );
}
23
chrome/chromeFiles/content/scholar/ingester/browser.xul
Executable file
@@ -0,0 +1,23 @@
<?xml version="1.0"?>

<!-- Note: Contains Firefox-specific overlay -->

<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
    <script src="../include.js"/>
    
    <script src="ingester.js"/>
    <script src="browser.js"/>
    
    <script type="application/x-javascript">
        Scholar.Ingester.Interface.init();
    </script>
    
    <statusbar id="status-bar">
        <statusbarpanel id="scholar-status" insertafter="livemark-button">
            <label id="scholar-status-label" collapsed="true" crop="end" style="width:0px" />
            <image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
        </statusbarpanel>
    </statusbar>
</overlay>
473
chrome/chromeFiles/content/scholar/ingester/ingester.js
Normal file
@@ -0,0 +1,473 @@
// Firefox Scholar Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but since we don't really want an enormous web server running with FS
// (and don't actually need one), this version is much simpler.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
    this.data = new Object();
}

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
    if(!this.data[uri]) this.data[uri] = new Object();
    this.data[uri][rdfUri] = literal;
    Scholar.debug(rdfUri+" for "+uri+" is "+literal);
}
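
// Illustrative usage (not part of this commit; the URIs and values are
// hypothetical): a scraper populates the model with one entry per resource
// URI, keyed by predicate URI, and _updateDatabase() later reads
// model.data[uri][predicate].
//
//   var model = new Scholar.Ingester.Model();
//   model.addStatement("http://www.example.com/book/1",
//       "http://purl.org/dc/elements/1.1/title", "An Example History");
//   model.addStatement("http://www.example.com/book/1",
//       "http://purl.org/dc/elements/1.1/creator", "Doe, Jane");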

// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Utilities
//
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function() {}

// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
    Scholar.debug(msg, 4);
}

// Appears to trim a string, chopping off newlines/spacing
Scholar.Ingester.Utilities.prototype.trimString = function(s) {
    var i = 0;
    var spaceChars = " \n\r\t" + String.fromCharCode(160) /* non-breaking space */;
    while (i < s.length) {
        var c = s.charAt(i);
        if (spaceChars.indexOf(c) < 0) {
            break;
        }
        i++;
    }
    
    s = s.substring(i);
    
    i = s.length;
    while (i > 0) {
        var c = s.charAt(i - 1);
        if (spaceChars.indexOf(c) < 0) {
            break;
        }
        i--;
    }
    
    return s.substring(0, i);
}

// Takes an XPath query and returns the results
Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
    var elmts = [];
    
    var iterator = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE, null);
    var elmt = iterator.iterateNext();
    var i = 0;
    while (elmt) {
        elmts[i++] = elmt;
        elmt = iterator.iterateNext();
    }
    return elmts;
}
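
// Illustrative usage (not part of this commit; the XPath and class name are
// hypothetical): inside a scraper, elements can be collected and cleaned like so.
//
//   var titles = utilities.gatherElementsOnXPath(doc, doc, '//span[@class="title"]', null);
//   for(var i = 0; i < titles.length; i++) {
//       utilities.debugPrint(utilities.trimString(titles[i].textContent));
//   }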

// Loads a single document for a scraper, running succeeded() on success or
// failed() on failure
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
    this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
}

// Downloads and processes documents with processor()
// browser - a browser object
// firstDoc - the first document to process with the processor (if null,
//            first document is processed without processor)
// urls - an array of URLs to load
// processor - a function to execute to process each document
// done - a function to execute when all document processing is complete
// exception - a function to execute if an exception occurs (exceptions are
//             also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
    try {
        if (urls.length == 0) {
            if (firstDoc) {
                processor(firstDoc, done);
            } else {
                done();
            }
            return;
        }
        
        var urlIndex = -1;
        var doLoad = function() {
            urlIndex++;
            if (urlIndex < urls.length) {
                try {
                    var url = urls[urlIndex];
                    var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
                    b.loadURI(url);
                } catch (e) {
                    exception(e);
                    Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
                }
            } else {
                window.setTimeout(done, 10);
            }
        };
        var onLoad = function() {
            try {
                var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
                processor(b.contentDocument, doLoad);
            } catch (e) {
                exception(e);
                Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
            }
        };
        var init = function() {
            // listener must be initialized before properties are attached to it
            var listener = new Object();
            listener.onStateChange = function(webProgress, request, stateFlags, status) {
                if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
                        request.name == urls[urlIndex]) {
                    try {
                        Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
                    } catch (e) {
                        exception(e);
                        Scholar.debug("Scholar.Ingester.Utilities.processDocuments onStateChange: " + e, 2);
                    }
                }
            };
            
            var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
            tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
            
            if (firstDoc) {
                processor(firstDoc, doLoad);
            } else {
                doLoad();
            }
        }
        
        // Wire everything up once the progress dialog (which hosts the hidden
        // browser) has loaded
        Scholar.Ingester.progressDialog.addEventListener("load", init, false);
    } catch (e) {
        exception(e);
        Scholar.debug("Scholar.Ingester.Utilities.processDocuments: " + e, 2);
    }
}
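
// Illustrative usage (not part of this commit; the URLs and processor are
// hypothetical): a scraper that needs to visit detail pages could call
//
//   utilities.processDocuments(browser, null, detailPageURLs,
//       function(newDoc, next) { /* pull data from newDoc into model */ next(); },
//       function() { done(); },
//       function(e) { utilities.debugPrint(e); });
//
// where each processor invocation receives the loaded document plus a
// continuation that advances to the next URL.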

// Appears to look for links in a document containing a certain substring
Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
    var urls = [];
    var addedURLs = [];
    
    var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE, null);
    var aElement = aElements.iterateNext();
    while (aElement) {
        var href = aElement.href;
        if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
            urls.unshift(href);
            addedURLs[href] = true;
        }
        aElement = aElements.iterateNext();
    }
    return urls;
}

// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.

// It looks like these are simple front-ends for XMLHttpRequest. They're a
// component of the Piggy Bank API, so they're implemented here.
Scholar.Ingester.Utilities.HTTPUtilities = function() {}

Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
    var xmlhttp = new XMLHttpRequest();
    
    xmlhttp.open('GET', url, true);
    xmlhttp.overrideMimeType("text/xml");
    xmlhttp.onreadystatechange = function() {
        Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
    };
    xmlhttp.send(null);
}

Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
    var xmlhttp = new XMLHttpRequest();
    
    xmlhttp.open('POST', url, true);
    xmlhttp.overrideMimeType("text/xml");
    xmlhttp.onreadystatechange = function() {
        Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
    };
    xmlhttp.send(body);
}

Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
    var xmlhttp = new XMLHttpRequest();
    
    xmlhttp.open('OPTIONS', url, true);
    xmlhttp.overrideMimeType("text/xml");
    xmlhttp.onreadystatechange = function() {
        Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
    };
    xmlhttp.send(body);
}
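
// Illustrative usage (not part of this commit; the URL and callbacks are
// hypothetical). onStatus receives (status, statusText, xmlhttp) for non-200
// responses; onDone receives (responseText, xmlhttp) when the download finishes.
//
//   var http = new Scholar.Ingester.Utilities.HTTPUtilities();
//   http.doGet("http://www.example.com/api/record?id=1",
//       function(status, statusText, xmlhttp) { Scholar.debug("HTTP "+status+" "+statusText, 2); },
//       function(responseText, xmlhttp) { /* parse responseText */ });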

// Possible point of failure; for some reason, this used to be a separate
// class, so make sure it works. Defined as a static function (rather than on
// the prototype) because doGet/doPost/doOptions call it directly on
// HTTPUtilities.
Scholar.Ingester.Utilities.HTTPUtilities.stateChange = function(xmlhttp, onStatus, onDone) {
    switch (xmlhttp.readyState) {
    
        // Request not yet made
        case 1:
            break;
        
        // Contact established with server but nothing downloaded yet
        case 2:
            try {
                // Check for HTTP status 200
                if (xmlhttp.status != 200) {
                    if (onStatus) {
                        onStatus(
                            xmlhttp.status,
                            xmlhttp.statusText,
                            xmlhttp
                        );
                        xmlhttp.abort();
                    }
                }
            } catch (e) {
                Scholar.debug(e, 2);
            }
            break;
        
        // Called multiple times while the download is in progress
        case 3:
            break;
        
        // Download complete
        case 4:
            try {
                if (onDone) {
                    onDone(xmlhttp.responseText, xmlhttp);
                }
            } catch (e) {
                Scholar.debug(e, 2);
            }
            break;
    }
}

//////////////////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Document
//
//////////////////////////////////////////////////////////////////////////////

/* Public properties:
 * browser - browser window object of document
 * model - data model for semantic scrapers
 * scraper - best scraper to use to scrape page
 *
 * Private properties:
 * _sandbox - sandbox for code execution
 * _progressDialog - dialog showing scrape progress
 */

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Constructor for Document object
 */
Scholar.Ingester.Document = function(browserWindow){
    this.browser = browserWindow;
    this.scraper = null;
    this.model = new Scholar.Ingester.Model();
    this._generateSandbox();
}

/*
 * Retrieves the best scraper to scrape a given page
 */
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
    Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
    var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
    var scrapers = Scholar.DB.query(sql);
    for(var i=0; i<scrapers.length; i++) {
        var currentScraper = scrapers[i];
        if(this.canScrape(currentScraper)) {
            this.scraper = currentScraper;
            Scholar.debug("Found scraper "+this.scraper.label);
            return true;
        }
    }
    return false;
}

/*
 * Check to see if _scraper_ can scrape this document
 */
Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
    var canScrape = false;
    
    // Test with regular expression
    // If this is slow, we could preload all scrapers and compile regular
    // expressions, so each check will be faster
    if(currentScraper.urlPattern) {
        var regularExpression = new RegExp(currentScraper.urlPattern, "i");
        if(regularExpression.test(this.browser.contentDocument.location.href)) {
            canScrape = true;
        }
    }
    
    // Test with JavaScript if available and didn't have a regular expression or
    // passed regular expression test
    if((!currentScraper.urlPattern || canScrape)
            && currentScraper.scraperDetectCode) {
        var scraperSandbox = this.sandbox;
        try {
            canScrape = Components.utils.evalInSandbox("(function(){\n" +
                currentScraper.scraperDetectCode +
                "\n})()", scraperSandbox);
        } catch(e) {
            throw e+' in scraper '+currentScraper.label;
        }
    }
    return canScrape;
}
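
// Illustrative example (not part of this commit; the pattern and detect code
// are hypothetical): a row in the scrapers table might look like
//
//   urlPattern:        "^https?://catalog\\.example\\.edu/"
//   scraperDetectCode: "return doc.title.indexOf('Library Catalog') != -1;"
//
// The detect code is wrapped in (function(){ ... })() and run in the sandbox,
// so its return value becomes canScrape.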

/*
 * Populate model with semantic data regarding this page using _scraper_
 */
Scholar.Ingester.Document.prototype.scrapePage = function() {
    Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
    
    var scraperSandbox = this.sandbox;
    
    this._progressDialog = openDialog("chrome://scholar/content/ingester/scrape-progress.xul",
        "_blank", "chrome,all,dialog=no", null, null, null);
    
    Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
    
    // If synchronous, call _scrapePageComplete();
    if(!scraperSandbox._waitForCompletion) {
        this._scrapePageComplete();
    }
}

//////////////////////////////////////////////////////////////////////////////
//
// Private Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Piggy Bank/FS offers four objects to JavaScript scrapers:
 * browser - the object representing the open browser window containing the
 *           document to be processed
 * doc - the DOM (basically just browser.contentDocument)
 * model - the object representing the RDF model of data to be returned
 *         (see Scholar.Ingester.Model)
 * utilities - a set of utilities for making certain tasks easier
 *             (see Scholar.Ingester.Utilities)
 *
 * Piggy Bank/FS also offers two functions to simplify asynchronous requests
 * (these will only be available for scraping, and not for scrape detection):
 * wait() - called on asynchronous requests so that Piggy Bank/FS will not
 *          automatically return at the end of code execution
 * done() - when wait() is called, Piggy Bank/FS will wait for this
 *          function before returning
 */
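
// Illustrative sketch (not part of this commit) of a scraperJavaScript using
// the objects above; the XPath, predicate URI, and page structure are hypothetical:
//
//   var uri = doc.location.href;
//   var titles = utilities.gatherElementsOnXPath(doc, doc, '//h1[@class="title"]', null);
//   if(titles.length) {
//       model.addStatement(uri, 'http://purl.org/dc/elements/1.1/title',
//           utilities.trimString(titles[0].textContent));
//   }
//   // A scraper that makes an asynchronous request would call wait() before
//   // the request and done() from its callback.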

/*
 * Called when scraping (synchronous or asynchronous) is complete
 */
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
    this._updateDatabase();
    this._progressDialog.close();
}

/*
 * Builds the sandbox that scraper code runs in, exposing browser, doc, model,
 * utilities, XPathResult, wait(), and done()
 */
Scholar.Ingester.Document.prototype._generateSandbox = function() {
    this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
    this.sandbox.browser = this.browser;
    this.sandbox.doc = this.sandbox.browser.contentDocument;
    this.sandbox.utilities = new Scholar.Ingester.Utilities;
    this.sandbox.model = this.model;
    this.sandbox.XPathResult = XPathResult;
    
    this.sandbox.wait = function(){ this._waitForCompletion = true; };
    this.sandbox.done = function(){ this._scrapePageComplete(); };
}

/*
 * Add data ingested using RDF to database
 * (Ontologies are hard-coded until we have a real way of dealing with them)
 */
Scholar.Ingester.Document.prototype._updateDatabase = function() {
    var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    var prefixDC = 'http://purl.org/dc/elements/1.1/';
    var prefixDCMI = 'http://purl.org/dc/dcmitype/';
    var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
    
    for(var uri in this.model.data) {
        var newItem = Scholar.Items.getNewItemByType(1);
        newItem.setField("source", uri);
        if(this.model.data[uri][prefixDC + 'title']) {
            newItem.setField("title", this.model.data[uri][prefixDC + 'title']);
        }
        if(this.model.data[uri][prefixDC + 'publisher']) {
            newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
        }
        if(this.model.data[uri][prefixDC + 'year']) {
            // Keep only the last space-delimited token of the year value
            newItem.setField("date", this.model.data[uri][prefixDC + 'year'].substring(
                this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
                this.model.data[uri][prefixDC + 'year'].length));
        }
        if(this.model.data[uri][prefixDC + 'edition']) {
            newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
        }
        if(this.model.data[uri][prefixDC + 'identifier']) {
            // Presumably strips an "ISBN " prefix from dc:identifier
            newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
        }
        if(this.model.data[uri][prefixDummy + 'pages']) {
            newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages']);
        }
        if(this.model.data[uri][prefixDC + 'creator']) {
            var creator = this.model.data[uri][prefixDC + 'creator'];
            
            var spaceIndex = creator.lastIndexOf(" ");
            var firstName = creator.substring(spaceIndex+1, creator.length);
            var lastName = creator.substring(0, spaceIndex);
            
            newItem.setCreator(0, firstName, lastName);
        }
        newItem.save();
    }
}
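
// Illustrative example (not part of this commit; the URI and values are
// hypothetical): a model populated as
//
//   model.data["http://catalog.example.edu/record/1"] = {
//       'http://purl.org/dc/elements/1.1/title': 'An Example History',
//       'http://purl.org/dc/elements/1.1/creator': 'Doe, Jane',
//       'http://chnm.gmu.edu/firefox-scholar/pages': '312'
//   };
//
// would yield one saved item with its source, title, creator, and pages set.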
27
chrome/chromeFiles/content/scholar/ingester/scrape-progress.xul
Normal file
@@ -0,0 +1,27 @@
<?xml version="1.0" ?>
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
<!DOCTYPE overlay SYSTEM "chrome://piggy-bank/locale/load-dom-dialog.dtd">

<window
    xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
    xmlns:xul="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
    xmlns:html="http://www.w3.org/1999/xhtml"
    id="scrape-progress"
    windowtype="Options"
    orient="vertical"
    screenX="10" screenY="10"
    persist="width height screenX screenY sizeMode"
    title="Scraping Page…"
>

    <hbox flex="1">
        <vbox flex="1" style="padding: 10px">
            <label value="Scraping Page…" />
            <progressmeter id="progress" mode="undetermined" />
        </vbox>
        <resizer id="window-resizer" dir="bottomright"/>
        <box style="visibility: collapse">
            <tabbrowser id="hidden-browser" />
        </box>
    </hbox>
</window>
BIN
chrome/chromeFiles/skin/default/scholar/capture_colored.png
Normal file
Binary file not shown.
After: 668 B
BIN
chrome/chromeFiles/skin/default/scholar/capture_gray.png
Normal file
Binary file not shown.
After: 669 B