zotero/chrome/chromeFiles/content/scholar/xpcom/ingester.js

// Firefox Scholar Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

// Namespace object for the ingester; the Model, Utilities and Document
// constructors defined below are attached to it as properties.
Scholar.Ingester = new function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but we don't want an enormous web server running with FS and don't
// actually need one, so this version is much simpler.
// 
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// Container for every statement added to this model; starts out empty
	this.data = {};
};

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
// Records one statement in the model and logs it.
// uri - key of the resource the statement is about
// rdfUri - key of the property being set on that resource
// literal - the value stored; an earlier value for the same uri/rdfUri
//           pair is overwritten
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
	var statements = this.data[uri];
	if(!statements) {
		// First statement about this resource: create its property table
		statements = this.data[uri] = new Object();
	}
	statements[rdfUri] = literal;
	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
};

// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
// addTag: accepts any arguments, does nothing, returns undefined
Scholar.Ingester.Model.prototype.addTag = function() {};
// getRepository: does nothing, returns undefined
Scholar.Ingester.Model.prototype.getRepository = function() {};
// detachRepository: does nothing, returns undefined
Scholar.Ingester.Model.prototype.detachRepository = function() {};

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Utilities
//
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities: helper methods that assist scrapers with data
// extraction. Most of the code was taken directly from the Piggy Bank
// project. The constructor itself holds no state.
Scholar.Ingester.Utilities = function() {};

// Piggy Bank-compatible debug hook: forwards msg to Scholar.debug at a
// fixed log level of 4 (the level could be made configurable)
Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
	var logLevel = 4;
	Scholar.debug(msg, logLevel);
};

// Strips leading and trailing whitespace from s and returns the result.
// The characters treated as whitespace are space, newline, carriage return,
// tab and the non-breaking space (character code 160, &nbsp;).
Scholar.Ingester.Utilities.prototype.trimString = function(s) {
	var whitespace = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */;
	var start = 0;
	var end = s.length;
	
	// Move the start index forward past leading whitespace
	while (start < end && whitespace.indexOf(s.charAt(start)) >= 0) {
		start++;
	}
	
	// Move the end index back past trailing whitespace
	while (end > start && whitespace.indexOf(s.charAt(end - 1)) >= 0) {
		end--;
	}
	
	return s.substring(start, end);
};

// Runs an XPath query and returns every result as an array, in the order
// the result iterator yields them
// doc - object whose evaluate() method runs the query
// parentNode - context node passed to evaluate()
// xpath - the XPath expression
// nsResolver - namespace resolver passed to evaluate()
Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
	var results = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
	
	var matches = [];
	for (var node = results.iterateNext(); node; node = results.iterateNext()) {
		matches.push(node);
	}
	return matches;
};

// Loads a single document for a scraper by delegating to processDocuments
// with a one-element URL list
// url - the URL to load
// browser - passed through to processDocuments
// succeeded - used as the per-document processor
// failed - used as the exception handler
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
	// Nothing extra to do once the single document has been processed
	var whenAllDone = function() {};
	this.processDocuments(browser, null, [ url ], succeeded, whenAllDone, failed);
};

// Downloads and processes documents with processor()
// browser - a browser object (not referenced in the body; pages are loaded
//           in the "hidden-browser" element of
//           Scholar.Ingester.progressDialog)
// firstDoc - a document to hand to processor before any URL is loaded (if
//            null, processing starts directly with the first URL)
// urls - an array of URLs to load, one after another
// processor - a function to execute to process each document; called as
//             processor(document, next) and expected to call next() to
//             move on to the following URL
// done - a function to execute when all document processing is complete
// exception - a function to execute if an exception occurs (exceptions are
//             also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
	try {
		// Nothing to download: process the supplied document (if any) and finish
		if (urls.length == 0) {
			if (firstDoc) {
				processor(firstDoc, done);
			} else {
				done();
			}
			return;
		}
		
		var urlIndex = -1;
		
		// Advances to the next URL and starts loading it, or schedules done()
		// once the list is exhausted
		var doLoad = function() {
			urlIndex++;
			if (urlIndex < urls.length) {
				try {
					var url = urls[urlIndex];
					var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
					b.loadURI(url);
				} catch (e) {
					exception(e);
					Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
				}
			} else {
				// NOTE(review): `window` is not defined in this file; the
				// listener below uses Scholar.Ingester.progressDialog.setTimeout
				// instead - confirm which window is intended here
				window.setTimeout(done, 10);
			}
		};
		
		// Hands the freshly loaded document to processor, which continues the
		// chain by calling doLoad
		var onLoad = function() {
			try {
				var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
				processor(b.contentDocument, doLoad);
			} catch (e) {
				exception(e);
				Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
			}
		};
		
		// Registers the progress listener and starts the first step
		var init = function() {
			// The listener has to be an object before handlers can be attached
			// to it (it was previously left undefined, which threw here)
			var listener = {};
			listener.onStateChange = function(webProgress, request, stateFlags, status) {
				// React only when the request for the current URL has stopped
				if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
					request.name == urls[urlIndex]) {
					try {
						Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
					} catch (e) {
						exception(e);
						Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
					}
				}
			};
			
			var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
			tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
			
			if (firstDoc) {
				processor(firstDoc, doLoad);
			} else {
				doLoad();
			}
		};
		
		// NOTE(review): `w` is not declared in this function or anywhere in
		// this file. Presumably it should be the window that hosts
		// "hidden-browser" (Scholar.Ingester.progressDialog?) - confirm.
		// Until then this line throws and control passes to the catch below.
		w.addEventListener("load", init, false);
	} catch (e) {
		exception(e);
		// Log through Scholar.debug like the inner handlers; the Piggy Bank
		// logger PB_Debug is not defined in this file
		Scholar.debug("Scholar.Ingester.Utilities.processDocuments: " + e, 2);
	}
};

// Collects the href of every <a> element in doc whose href contains
// substring. Each distinct href is returned once; because matches are
// inserted at the front of the list, the result is in reverse document
// order.
Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
	var matches = [];
	// Array used as a string-keyed lookup of hrefs already collected
	var seen = [];
	
	var anchors = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
	for (var anchor = anchors.iterateNext(); anchor; anchor = anchors.iterateNext()) {
		var href = anchor.href;
		if (href.indexOf(substring) < 0 || seen[href]) {
			continue;
		}
		matches.unshift(href);
		seen[href] = true;
	}
	return matches;
};

// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.

// HTTPUtilities: thin front-ends for XMLHttpRequest. They are part of the
// Piggy Bank API, so they are implemented here as well. The constructor
// holds no state.
Scholar.Ingester.Utilities.HTTPUtilities = function() {};

// Sends an asynchronous GET request
// url - the URL to request
// onStatus - optional callback passed to stateChange (non-200 status)
// onDone - optional callback passed to stateChange (request complete)
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
	var xmlhttp = new XMLHttpRequest();
	
	xmlhttp.open('GET', url, true);
	xmlhttp.overrideMimeType("text/xml");
	xmlhttp.onreadystatechange = function() {
		// stateChange is defined on the prototype, not on the constructor,
		// so it has to be reached through .prototype
		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
	};
	xmlhttp.send(null);
};

// Sends an asynchronous POST request
// url - the URL to request
// body - the request body handed to send()
// onStatus - optional callback passed to stateChange (non-200 status)
// onDone - optional callback passed to stateChange (request complete)
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
	var xmlhttp = new XMLHttpRequest();
	
	xmlhttp.open('POST', url, true);
	xmlhttp.overrideMimeType("text/xml");
	xmlhttp.onreadystatechange = function() {
		// stateChange is defined on the prototype, not on the constructor,
		// so it has to be reached through .prototype
		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
	};
	xmlhttp.send(body);
};
	
// Sends an asynchronous OPTIONS request
// url - the URL to request
// body - the request body handed to send()
// onStatus - optional callback passed to stateChange (non-200 status)
// onDone - optional callback passed to stateChange (request complete)
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
	var xmlhttp = new XMLHttpRequest();
	
	xmlhttp.open('OPTIONS', url, true);
	xmlhttp.overrideMimeType("text/xml");
	xmlhttp.onreadystatechange = function() {
		// stateChange is defined on the prototype, not on the constructor,
		// so it has to be reached through .prototype
		Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange(xmlhttp, onStatus, onDone);
	};
	xmlhttp.send(body);
};
	
// Ready-state handler shared by the request helpers. This used to be a
// separate class, so it is a possible point of failure; make sure it works.
// xmlhttp - the XMLHttpRequest whose state changed
// onStatus - optional; when readyState 2 is reached with a status other
//            than 200 it is called as onStatus(status, statusText, xmlhttp)
//            and the request is then aborted
// onDone - optional; called as onDone(responseText, xmlhttp) at readyState 4
// Exceptions raised while handling either state are logged at level 2 and
// are not rethrown. All other ready states are ignored.
Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
	var state = xmlhttp.readyState;
	
	if (state === 2) {
		// Contact established with server but nothing downloaded yet
		try {
			if (xmlhttp.status != 200 && onStatus) {
				onStatus(xmlhttp.status, xmlhttp.statusText, xmlhttp);
				xmlhttp.abort();
			}
		} catch (e) {
			Scholar.debug(e, 2);
		}
	} else if (state === 4) {
		// Download complete
		try {
			if (onDone) {
				onDone(xmlhttp.responseText, xmlhttp);
			}
		} catch (e) {
			Scholar.debug(e, 2);
		}
	}
};
//////////////////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Document
//
//////////////////////////////////////////////////////////////////////////////

/* Public properties:
 * browser - browser window object of document
 * model - data model for semantic scrapers
 * scraper - best scraper to use to scrape page
 *
 * Private properties:
 * sandbox - sandbox for code execution (stored without a leading underscore)
 */

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Constructor for Document object
 *
 * browserWindow - stored as this.browser; its contentDocument is the page
 *                 that will be examined and scraped
 */
Scholar.Ingester.Document = function(browserWindow){
	// No scraper is selected until retrieveScraper() finds one
	this.scraper = null;
	this.browser = browserWindow;
	this.model = new Scholar.Ingester.Model();
	
	// Needs this.browser and this.model, so it has to run last
	this._generateSandbox();
};

/*
 * Retrieves the best scraper to scrape a given page
 *
 * Walks the rows of the scrapers table in the order given by the query and
 * stores the first one accepted by canScrape() in this.scraper.
 * Returns true if a scraper was found, false otherwise.
 */
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
	Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
	
	var candidates = Scholar.DB.query('SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC');
	for(var n=0; n<candidates.length; n++) {
		var candidate = candidates[n];
		if(!this.canScrape(candidate)) {
			continue;
		}
		this.scraper = candidate;
		Scholar.debug("Found scraper "+this.scraper.label);
		return true;
	}
	return false;
};

/*
 * Check to see if _scraper_ can scrape this document
 *
 * currentScraper - scraper record; urlPattern and scraperDetectCode are
 *                  both optional, label is used in error messages
 *
 * Returns false when neither test applies or the URL pattern does not
 * match, true when only the URL pattern applies and matches, and otherwise
 * whatever the detect code returns.
 * Throws a string of the form "<error> in scraper <label>" if the detect
 * code raises an exception.
 */
Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
	var canScrape = false;
	
	// Test with regular expression
	// If this is slow, we could preload all scrapers and compile regular
	// expressions, so each check will be faster
	if(currentScraper.urlPattern) {
		var regularExpression = new RegExp(currentScraper.urlPattern, "i");
		if(regularExpression.test(this.browser.contentDocument.location.href)) {
			canScrape = true;
		}
	}
	
	// Test with JavaScript if available and didn't have a regular expression or
	// passed regular expression test
	if((!currentScraper.urlPattern || canScrape)
	  && currentScraper.scraperDetectCode) {
		var scraperSandbox = this.sandbox;
		try {
			// Document defines no evalInSandbox method of its own; use the
			// same Components.utils call that scrapePage() uses. The detect
			// code is wrapped in a function so it can use "return".
			canScrape = Components.utils.evalInSandbox("(function(){\n" +
							   currentScraper.scraperDetectCode +
							   "\n})()", scraperSandbox);
		} catch(e) {
			throw e+' in scraper '+currentScraper.label;
		}
	}
	return canScrape;
};

/*
 * Populate model with semantic data regarding this page using _scraper_
 * Callback will be executed once scraping is complete
 *
 * callback - optional; when given it is remembered in this._scrapeCallback
 */
Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
	if(callback) {
		this._scrapeCallback = callback;
	}
	
	Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
	
	var sandbox = this.sandbox;
	Components.utils.evalInSandbox(this.scraper.scraperJavaScript, sandbox);
	
	if(sandbox._waitForCompletion) {
		// The scraper asked to wait, so completion is not triggered here
		return;
	}
	
	// Synchronous scraper: finish immediately
	this._scrapePageComplete();
};

//////////////////////////////////////////////////////////////////////////////
//
// Private Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Piggy Bank/FS offers four objects to JavaScript scrapers
 * browser - the object representing the open browser window containing the
 *           document to be processed
 * doc - the DOM (basically just browser.contentDocument)
 * model - the object representing the RDF model of data to be returned
 *         (see Scholar.Ingester.Model)
 * utilities - a set of utilities for making certain tasks easier
 *             (see Scholar.Ingester.Utilities);
 *
 * Piggy Bank/FS also offers two functions to simplify asynchronous requests
 * (these will only be available for scraping, and not for scrape detection)
 * wait() - called on asynchronous requests so that Piggy Bank/FS will not
 *          automatically return at the end of code execution
 * done() - when wait() is called, Piggy Bank/FS will wait for this
 *          function before returning
 */

/*
 * Called when scraping (synchronous or asynchronous) is complete
 *
 * Writes the collected model to the database first, then runs the callback
 * remembered by scrapePage(), if there is one
 */
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
	this._updateDatabase();
	
	if(!this._scrapeCallback) {
		return;
	}
	this._scrapeCallback();
};
 
/*
 * Creates this.sandbox, the sandbox in which scraper code is evaluated, and
 * populates it with the objects and functions offered to scrapers:
 * browser, doc, utilities, model, XPathResult, wait() and done()
 */
Scholar.Ingester.Document.prototype._generateSandbox = function() {
	this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
	this.sandbox.browser = this.browser;
	this.sandbox.doc = this.sandbox.browser.contentDocument;
	this.sandbox.utilities = new Scholar.Ingester.Utilities();
	this.sandbox.model = this.model;
	this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
	
	// Inside the callbacks below "this" is not this Document, so capture the
	// objects they need explicitly
	var me = this;
	var sandbox = this.sandbox;
	
	// scrapePage() reads the flag from the sandbox, so set it there
	this.sandbox.wait = function(){ sandbox._waitForCompletion = true; };
	// _scrapePageComplete lives on the Document, not on the sandbox
	this.sandbox.done = function(){ me._scrapePageComplete(); };
};

/*
 * Add data ingested using RDF to database
 * (Ontologies are hard-coded until we have a real way of dealing with them)
 *
 * Creates and saves one new item per resource URI in this.model.data,
 * copying over whichever of the Dublin Core title, publisher, year,
 * edition, identifier and creator values are present.
 */
Scholar.Ingester.Document.prototype._updateDatabase = function() {
	var prefixDC = 'http://purl.org/dc/elements/1.1/';
	
	for(var uri in this.model.data) {
		var statements = this.model.data[uri];
		
		// NOTE(review): item type 1 is hard-coded - confirm which type it is
		var newItem = Scholar.Items.getNewItemByType(1);
		newItem.setField("source", uri);
		
		var title = statements[prefixDC + 'title'];
		if(title) {
			newItem.setField("title", title);
		}
		
		var publisher = statements[prefixDC + 'publisher'];
		if(publisher) {
			newItem.setField("publisher", publisher);
		}
		
		var year = statements[prefixDC + 'year'];
		if(year) {
			// Keep only the text after the last space. This used to be
			// assigned to an undeclared "data" object, which threw.
			// NOTE(review): field name "year" is taken from the dc:year
			// predicate - confirm it matches the item field schema
			newItem.setField("year", year.substring(year.lastIndexOf(" ")+1, year.length));
		}
		
		var edition = statements[prefixDC + 'edition'];
		if(edition) {
			newItem.setField("edition", edition);
		}
		
		var identifier = statements[prefixDC + 'identifier'];
		if(identifier) {
			// Drops the first five characters (presumably an "ISBN " prefix -
			// verify against the scrapers)
			newItem.setField("ISBN", identifier.substring(5));
		}
		
		var creator = statements[prefixDC + 'creator'];
		if(creator) {
			// Everything after the last space is the last name, the rest the
			// first name; with no space the whole string is the last name
			var spaceIndex = creator.lastIndexOf(" ");
			var lastName = creator.substring(spaceIndex+1, creator.length);
			var firstName = creator.substring(0, spaceIndex);
			
			newItem.setCreator(0, firstName, lastName);
		}
		
		newItem.save();
	}
};
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`// Firefox Scholar Ingester`
			`// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)`
			`// This code is licensed according to the GPL`

			`Scholar.Ingester = new function() {}`

			`/////////////////////////////////////////////////////////////////`
			`//`
			`// Scholar.Ingester.Model`
			`//`
			`/////////////////////////////////////////////////////////////////`

			`// Scholar.Ingester.Model, an object representing an RDF data model with`
			`// methods to add to that model. In Piggy Bank, this was implemented in Java,`
			`// but seeing as we don't really want an enormous web server running with FS,`
			`// but we don't actually need that, so it's much simpler.`
			`//`
			`// The Java version of this class can be viewed at`
			`// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java`
			`Scholar.Ingester.Model = function() {`
			`this.data = new Object();`
			`}`

			`// Piggy Bank provides a fourth argument, one that determines if the third`
			`// argument is a literal or an RDF URI. Since our ontologies are`
			`// sufficiently restricted, we have no chance of confusing a literal and an`
			`// RDF URI and thus this is unnecessary.`
			`Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {`
			`if(!this.data[uri]) this.data[uri] = new Object();`
			`this.data[uri][rdfUri] = literal;`
			`Scholar.debug(rdfUri+" for "+uri+" is "+literal);`
			`}`

			`// Additional functions added for compatibility purposes only`
			`// No idea if any scraper actually uses these, but just in case, they're`
			`// implemented so as not to throw an exception`
			`Scholar.Ingester.Model.prototype.addTag = function() {}`
			`Scholar.Ingester.Model.prototype.getRepository = function() {}`
			`Scholar.Ingester.Model.prototype.detachRepository = function() {}`

			`/////////////////////////////////////////////////////////////////`
			`//`
			`// Scholar.Ingester.Utilities`
			`//`
			`/////////////////////////////////////////////////////////////////`
			`// Scholar.Ingester.Utilities class, a set of methods to assist in data`
			`// extraction. Most code here was stolen directly from the Piggy Bank project.`
			`Scholar.Ingester.Utilities = function() {}`

			`// Adapter for Piggy Bank function to print debug messages; log level is`
			`// fixed at 4 (could change this)`
			`Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {`
			`Scholar.debug(msg, 4);`
			`}`

			`// Appears to trim a string, chopping of newlines/spacing`
			`Scholar.Ingester.Utilities.prototype.trimString = function(s) {`
			`var i = 0;`
			`var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */;`
			`while (i < s.length) {`
			`var c = s.charAt(i);`
			`if (spaceChars.indexOf(c) < 0) {`
			`break;`
			`}`
			`i++;`
			`}`

			`s = s.substring(i);`

			`i = s.length;`
			`while (i > 0) {`
			`var c = s.charAt(i - 1);`
			`if (spaceChars.indexOf(c) < 0) {`
			`break;`
			`}`
			`i--;`
			`}`

			`return s.substring(0, i);`
			`}`

			`// Takes an XPath query and returns the results`
			`Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {`
			`var elmts = [];`

XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`var elmt = iterator.iterateNext();`
			`var i = 0;`
			`while (elmt) {`
			`elmts[i++] = elmt;`
			`elmt = iterator.iterateNext();`
			`}`
			`return elmts;`
			`}`

			`// Loads a single document for a scraper, running succeeded() on success or`
			`// failed() on failure`
			`Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {`
			`this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);`
			`}`

			`// Downloads and processes documents with processor()`
			`// browser - a browser object`
			`// firstDoc - the first document to process with the processor (if null,`
			`// first document is processed without processor)`
			`// urls - an array of URLs to load`
			`// processor - a function to execute to process each document`
			`// done - a function to execute when all document processing is complete`
			`// exception - a function to execute if an exception occurs (exceptions are`
			`// also logged in the Firefox Scholar log)`
			`Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {`
			`try {`
			`if (urls.length == 0) {`
			`if (firstDoc) {`
			`processor(firstDoc, done);`
			`} else {`
			`done();`
			`}`
			`return;`
			`}`

			`var urlIndex = -1;`
			`var doLoad = function() {`
			`urlIndex++;`
			`if (urlIndex < urls.length) {`
			`try {`
			`var url = urls[urlIndex];`
			`var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");`
			`b.loadURI(url);`
			`} catch (e) {`
			`exception(e);`
			`Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);`
			`}`
			`} else {`
			`window.setTimeout(done, 10);`
			`}`
			`};`
			`var onLoad = function() {`
			`try {`
			`var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;`
			`processor(b.contentDocument, doLoad);`
			`} catch (e) {`
			`exception(e);`
			`Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);`
			`}`
			`};`
			`var init = function() {`
			`var listener;`
			`listener.onStateChange = function(webProgress, request, stateFlags, status) {`
			`if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&`
			`request.name == urls[urlIndex]) {`
			`try {`
			`Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);`
			`} catch (e) {`
			`exception(e);`
			`Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);`
			`}`
			`}`
			`};`

			`var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");`
			`tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);`

			`if (firstDoc) {`
			`processor(firstDoc, doLoad);`
			`} else {`
			`doLoad();`
			`}`
			`}`

			`w.addEventListener("load", init, false);`
			`} catch (e) {`
			`exception(e);`
			`PB_Debug.print("processDocuments: " + e);`
			`}`
			`}`

			`// Appears to look for links in a document containing a certain substring`
			`Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {`
			`var urls = [];`
			`var addedURLs = [];`

XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`var aElement = aElements.iterateNext();`
			`while (aElement) {`
			`var href = aElement.href;`
			`if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {`
			`urls.unshift(href);`
			`addedURLs[href] = true;`
			`}`
			`aElement = aElements.iterateNext();`
			`}`
			`return urls;`
			`}`

			`// For now, we're going to skip the getLLsFromAddresses function (which gets`
			`// latitude and longitude pairs from a series of addresses, but requires the`
			`// big mess of Java code that is the Piggy Bank server) and the geoHelper`
			`// tools (which rely on getLLsFromAddresses) since these are probably not`
			`// essential components for Scholar and would take a great deal of effort to`
			`// implement. We can, however, always implement them later.`

			`// It looks like these are simple front-ends for XMLHttpRequest. They're a`
			`// component of the Piggy Bank API, so they're implemented here.`
			`Scholar.Ingester.Utilities.HTTPUtilities = function() {}`

			`Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {`
			`var xmlhttp = new XMLHttpRequest();`

			`xmlhttp.open('GET', url, true);`
			`xmlhttp.overrideMimeType("text/xml");`
			`xmlhttp.onreadystatechange = function() {`
			`Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);`
			`};`
			`xmlhttp.send(null);`
			`}`

			`Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {`
			`var xmlhttp = new XMLHttpRequest();`

			`xmlhttp.open('POST', url, true);`
			`xmlhttp.overrideMimeType("text/xml");`
			`xmlhttp.onreadystatechange = function() {`
			`Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);`
			`};`
			`xmlhttp.send(body);`
			`}`

			`Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {`
			`var xmlhttp = new XMLHttpRequest();`

			`xmlhttp.open('OPTIONS', url, true);`
			`xmlhttp.overrideMimeType("text/xml");`
			`xmlhttp.onreadystatechange = function() {`
			`Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);`
			`};`
			`xmlhttp.send(body);`
			`}`

			`// Possible point of failure; for some reason, this used to be a separate`
			`// class, so make sure it works`
			`Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {`
			`switch (xmlhttp.readyState) {`

			`// Request not yet made`
			`case 1:`
			`break;`

			`// Contact established with server but nothing downloaded yet`
			`case 2:`
			`try {`
			`// Check for HTTP status 200`
			`if (xmlhttp.status != 200) {`
			`if (onStatus) {`
			`onStatus(`
			`xmlhttp.status,`
			`xmlhttp.statusText,`
			`xmlhttp`
			`);`
			`xmlhttp.abort();`
			`}`
			`}`
			`} catch (e) {`
			`Scholar.debug(e, 2);`
			`}`
			`break;`

			`// Called multiple while downloading in progress`
			`case 3:`
			`break;`

			`// Download complete`
			`case 4:`
			`try {`
			`if (onDone) {`
			`onDone(xmlhttp.responseText, xmlhttp);`
			`}`
			`} catch (e) {`
			`Scholar.debug(e, 2);`
			`}`
			`break;`
			`}`
			`}`
			`//////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Scholar.Ingester.Document`
			`//`
			`//////////////////////////////////////////////////////////////////////////////`

			`/* Public properties:`
			`* browser - browser window object of document`
			`* model - data model for semantic scrapers`
			`* scraper - best scraper to use to scrape page`
			`*`
			`* Private properties:`
			`* _sandbox - sandbox for code execution`
			`*/`

			`//////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Public Scholar.Ingester.Document methods`
			`//`
			`//////////////////////////////////////////////////////////////////////////////`

			`/*`
			`* Constructor for Document object`
			`*/`
			`Scholar.Ingester.Document = function(browserWindow){`
			`this.browser = browserWindow;`
			`this.scraper = null`
			`this.model = new Scholar.Ingester.Model();`
			`this._generateSandbox();`
			`}`

			`/*`
			`* Retrieves the best scraper to scrape a given page`
			`*/`
			`Scholar.Ingester.Document.prototype.retrieveScraper = function() {`
			`Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);`
			`var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';`
			`var scrapers = Scholar.DB.query(sql);`
			`for(var i=0; i<scrapers.length; i++) {`
			`var currentScraper = scrapers[i];`
			`if(this.canScrape(currentScraper)) {`
			`this.scraper = currentScraper;`
			`Scholar.debug("Found scraper "+this.scraper.label);`
			`return true;`
			`}`
			`}`
			`return false;`
			`}`

			`/*`
			`* Check to see if _scraper_ can scrape this document`
			`*/`
			`Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {`
			`var canScrape = false;`

			`// Test with regular expression`
			`// If this is slow, we could preload all scrapers and compile regular`
			`// expressions, so each check will be faster`
			`if(currentScraper.urlPattern) {`
			`var regularExpression = new RegExp(currentScraper.urlPattern, "i");`
			`if(regularExpression.test(this.browser.contentDocument.location.href)) {`
			`canScrape = true;`
			`}`
			`}`

			`// Test with JavaScript if available and didn't have a regular expression or`
			`// passed regular expression test`
			`if((!currentScraper.urlPattern \|\| canScrape)`
			`&& currentScraper.scraperDetectCode) {`
			`var scraperSandbox = this.sandbox;`
			`try {`
			`canScrape = this.evalInSandbox("(function(){\n" +`
			`currentScraper.scraperDetectCode +`
			`"\n})()", scraperSandbox);`
			`} catch(e) {`
			`throw e+' in scraper '+currentScraper.label;`
			`}`
			`}`
			`return canScrape;`
			`}`

			`/*`
			`* Populate model with semantic data regarding this page using _scraper_`
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`* Callback will be executed once scraping is complete`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`*/`
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`Scholar.Ingester.Document.prototype.scrapePage = function(callback) {`
			`if(callback) {`
			`this._scrapeCallback = callback;`
			`}`

Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`Scholar.debug("Scraping "+this.browser.contentDocument.location.href);`

			`var scraperSandbox = this.sandbox;`

			`Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);`

			`// If synchronous, call _scrapePageComplete();`
			`if(!scraperSandbox._waitForCompletion) {`
			`this._scrapePageComplete();`
			`}`
			`}`

			`//////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Private Scholar.Ingester.Document methods`
			`//`
			`//////////////////////////////////////////////////////////////////////////////`

			`/*`
			`* Piggy Bank/FS offers four objects to JavaScript scrapers`
			`* browser - the object representing the open browser window containing the`
			`* document to be processes`
			`* doc - the DOM (basically just browser.contentDocument)`
			`* model - the object representing the RDF model of data to be returned`
			`* (see Scholar.Ingester.Model)`
			`* utilities - a set of utilities for making certain tasks easier`
			`* (see Scholar.Ingester.Utilities);`
			`*`
			`* Piggy Bank/FS also offers two functions to simplify asynchronous requests`
			`* (these will only be available for scraping, and not for scrape detection)`
			`* wait() - called on asynchronous requests so that Piggy Bank/FS will not`
			`* automatically return at the end of code execution`
			`* done() - when wait() is called, Piggy Bank/FS will wait for this`
			`* function before returning`
			`*/`

XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			/*`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`* Called when scraping (synchronous or asynchronous) is complete`
			`*/`
			`Scholar.Ingester.Document.prototype._scrapePageComplete = function() {`
			`this._updateDatabase();`
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`if(this._scrapeCallback) {`
			`this._scrapeCallback();`
			`}`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`}`

			`Scholar.Ingester.Document.prototype._generateSandbox = function() {`
			`this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);`
			`this.sandbox.browser = this.browser;`
			`this.sandbox.doc = this.sandbox.browser.contentDocument;`
			`this.sandbox.utilities = new Scholar.Ingester.Utilities;`
			`this.sandbox.model = this.model;`
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00
			`this.sandbox.wait = function(){ this._waitForCompletion = true; };`
			`this.sandbox.done = function(){ this._scrapePageComplete(); };`
			`}`

			`/*`
			`* Add data ingested using RDF to database`
			`* (Ontologies are hard-coded until we have a real way of dealing with them)`
			`*/`
			`Scholar.Ingester.Document.prototype._updateDatabase = function() {`
			`var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';`
			`var prefixDC = 'http://purl.org/dc/elements/1.1/';`
			`var prefixDCMI = 'http://purl.org/dc/dcmitype/';`
			`var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';`

			`for(var uri in this.model.data) {`
			`var newItem = Scholar.Items.getNewItemByType(1);`
			`newItem.setField("source", uri);`
			`if(this.model.data[uri][prefixDC + 'title']) {`
			`newItem.setField("title", this.model.data[uri][prefixDC + 'title']);`
			`}`
			`if(this.model.data[uri][prefixDC + 'publisher']) {`
			`newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);`
			`}`
			`if(this.model.data[uri][prefixDC + 'year']) {`
			`data.date = this.model.data[uri][prefixDC + 'year'].substring(`
			`this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,`
			`this.model.data[uri][prefixDC + 'year'].length);`
			`}`
			`if(this.model.data[uri][prefixDC + 'edition']) {`
			`newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);`
			`}`
			`if(this.model.data[uri][prefixDC + 'identifier']) {`
			`newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));`
			`}`
			`if(this.model.data[uri][prefixDC + 'creator']) {`
			`var creator = this.model.data[uri][prefixDC + 'creator'];`

			`var spaceIndex = creator.lastIndexOf(" ");`
XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?) 2006-06-02 03:19:12 +00:00			`var lastName = creator.substring(spaceIndex+1, creator.length);`
			`var firstName = creator.substring(0, spaceIndex);`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00
			`newItem.setCreator(0, firstName, lastName);`
			`}`
			`newItem.save();`
			`}`
			`}`