Still getting the hang of Subversion...the rest of the ingester code

2006-06-01 06:53:39 +00:00 · 2006-06-01 06:53:39 +00:00 · 551582eb7e
commit 551582eb7e
parent 24d35c5547
6 changed files with 663 additions and 0 deletions
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@ -0,0 +1,140 @@
+// Firefox Scholar Ingester Browser Functions
+// Utilities based on code taken from Greasemonkey
+// This code is licensed according to the GPL
+
+// Prepare the browser and collector instrumentation caches --------------------
+Scholar.Ingester.Interface = function() {}
+
+Scholar.Ingester.Interface.init = function() {
+	Scholar.Ingester.Interface.browsers = new Array();
+	
+    window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
+    window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
+    
+	Scholar.Ingester.Interface.browsers = new Array();
+	Scholar.Ingester.Interface.browserDocuments = new Object(); 
+}
+
+Scholar.Ingester.Interface.chromeLoad = function() {
+	Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
+	Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
+	Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
+	
+	// this gives us onLocationChange
+	Scholar.Ingester.Interface.tabBrowser.addProgressListener(Scholar.Ingester.Interface.Listener,
+		Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
+	// this gives us DOMContentLoaded
+    Scholar.Ingester.Interface.appContent.addEventListener("DOMContentLoaded",
+    	Scholar.Ingester.Interface.contentLoad, true);
+}
+
+Scholar.Ingester.Interface.chromeUnload = function() {
+  this.tabBrowser.removeProgressListener(this);
+}
+
+Scholar.Ingester.Interface.getDocument = function(browser) {
+	try {
+		var key = browser.getAttribute("scholar-key");
+		if(Scholar.Ingester.Interface.browserDocuments[key]) {
+			return Scholar.Ingester.Interface.browserDocuments[key];
+		}
+	} finally {}
+	return false;
+}
+
+Scholar.Ingester.Interface.setDocument = function(browser) {
+	try {
+		var key = browser.getAttribute("scholar-key");
+	} finally {
+		if(!key) {
+			var key = (new Date()).getTime();
+			browser.setAttribute("scholar-key", key);
+		}
+	}
+	Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
+	Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
+}
+
+Scholar.Ingester.Interface.deleteDocument = function(browser) {
+	try {
+		var key = browser.getAttribute("scholar-key");
+		if(Scholar.Ingester.Interface.browserDocuments[key]) {
+			delete Scholar.Ingester.Interface.browserDocuments[key];
+			return true;
+		}
+	} finally {}
+	return false;
+}
+
+Scholar.Ingester.Interface.scrapeThisPage = function() {
+	var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+	if(document.scraper) {
+		document.scrapePage();
+	}
+}
+
+Scholar.Ingester.Interface.updateStatus = function(browser) {
+	var document = Scholar.Ingester.Interface.getDocument(browser);
+	if(document && document.scraper) {
+		this.statusImage.src = "chrome://scholar/skin/capture_colored.png";
+	} else {
+		this.statusImage.src = "chrome://scholar/skin/capture_gray.png";
+	}
+}
+
+Scholar.Ingester.Interface.contentLoad = function() {
+	Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+	Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
+}
+
+Scholar.Ingester.Interface.Listener = function() {}
+Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
+Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
+Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
+Scholar.Ingester.Interface.Listener.onStateChange = function() {}
+Scholar.Ingester.Interface.Listener.onLocationChange = function() {
+    var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
+
+    // Remove document object of any browser that no longer exists
+    for (var i = 0; i < Scholar.Ingester.Interface.browsers.length; i++) {
+        var browser = Scholar.Ingester.Interface.browsers[i];
+        var exists = false;
+
+        for (var j = 0; j < browsers.length; j++) {
+            if (browser == browsers[j]) {
+                exists = true;
+                break;
+            }
+        }
+
+        if (!exists) {
+            Scholar.Ingester.Interface.browsers.splice(i,1);
+
+        	// To execute if document object does not exist
+            Scholar.Ingester.Interface.deleteDocument(browser);
+        }
+    }
+
+    /*// Add a collector to any new browser
+    for (var i = 0; i < browsers.length; i++) {
+        var browser = browsers[i];
+        var exists = false;
+
+        for (var j = 0; j < Scholar.Ingester.Interface.browsers.length; j++) {
+            if (browser == Scholar.Ingester.Interface.browsers[j]) {
+                exists = true;
+                break;
+            }
+        }
+
+        if (!exists) {
+            Scholar.Ingester.Interface.browsers.splice(i,0,browser);
+            
+        	// To execute if window is new
+        }
+    }*/
+
+    Scholar.Ingester.Interface.updateStatus(
+    	Scholar.Ingester.Interface.tabBrowser.selectedBrowser
+    );
+}
--- a/chrome/chromeFiles/content/scholar/ingester/browser.xul
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+
+
+<!-- Note: Contains Firefox-specific overlay -->
+<!-- Loads the ingester scripts into the overlaid window, initializes the
+     ingester interface, and adds the capture status image to the status bar. -->
+
+<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
+
+	<script src="../include.js"/>
+	
+    <script src="ingester.js"/>
+    <script src="browser.js"/>
+    
+	<!-- Registers the window load/unload handlers defined in browser.js -->
+	<script type="application/x-javascript">
+		Scholar.Ingester.Interface.init();
+	</script>
+    
+    <statusbar id="status-bar">
+        <!-- NOTE(review): "schlar-status" looks like a typo for
+             "scholar-status"; check for other references before renaming. -->
+        <statusbarpanel id="schlar-status" insertafter="livemark-button">
+            <label id="scholar-status-label" collapsed="true" crop="end" style="width:0px" />
+            <!-- Clicking the image scrapes the page in the selected tab -->
+            <image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
+         </statusbarpanel>
+    </statusbar>
+</overlay>
--- a/chrome/chromeFiles/content/scholar/ingester/ingester.js
+++ b/chrome/chromeFiles/content/scholar/ingester/ingester.js
@ -0,0 +1,473 @@
+// Firefox Scholar Ingester
+// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
+// This code is licensed according to the GPL
+
+// Root namespace object for the ingester; Model, Utilities and Document are
+// attached to it below
+Scholar.Ingester = new function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Model
+//
+/////////////////////////////////////////////////////////////////
+
+// Scholar.Ingester.Model, an object representing an RDF data model with
+// methods to add to that model. In Piggy Bank, this was implemented in Java,
+// but seeing as we don't really want an enormous web server running with FS,
+// but we don't actually need that, so it's much simpler.
+// 
+// The Java version of this class can be viewed at
+// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
+Scholar.Ingester.Model = function() {
+	this.data = new Object();
+}
+
+// Piggy Bank provides a fourth argument, one that determines if the third
+// argument is a literal or an RDF URI. Since our ontologies are
+// sufficiently restricted, we have no chance of confusing a literal and an
+// RDF URI and thus this is unnecessary.
+Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
+	if(!this.data[uri]) this.data[uri] = new Object();
+	this.data[uri][rdfUri] = literal;
+	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
+}
+
+// Additional functions added for compatibility purposes only
+// No idea if any scraper actually uses these, but just in case, they're
+// implemented so as not to throw an exception
+// (each one is a no-op that ignores its arguments and returns undefined)
+Scholar.Ingester.Model.prototype.addTag = function() {}
+Scholar.Ingester.Model.prototype.getRepository = function() {}
+Scholar.Ingester.Model.prototype.detachRepository = function() {}
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Utilities
+//
+/////////////////////////////////////////////////////////////////
+// Scholar.Ingester.Utilities class, a set of methods to assist in data
+// extraction. Most code here was stolen directly from the Piggy Bank project.
+// The constructor keeps no state; all helpers live on the prototype.
+Scholar.Ingester.Utilities = function() {}
+
+// Adapter for Piggy Bank function to print debug messages; log level is
+// fixed at 4 (could change this)
+Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
+	Scholar.debug(msg, 4);
+}
+
+// Appears to trim a string, chopping of newlines/spacing
+Scholar.Ingester.Utilities.prototype.trimString = function(s) {
+	var i = 0;
+	var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */;
+	while (i < s.length) {
+		var c = s.charAt(i);
+		if (spaceChars.indexOf(c) < 0) {
+			break;
+		}
+		i++;
+	}
+	
+	s = s.substring(i);
+	
+	i = s.length;
+	while (i > 0) {
+		var c = s.charAt(i - 1);
+		if (spaceChars.indexOf(c) < 0) {
+			break;
+		}
+		i--;
+	}
+	
+	return s.substring(0, i);
+}
+
+// Takes an XPath query and returns the results
+Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
+	var elmts = [];
+	
+	var iterator = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE,null);
+	var elmt = iterator.iterateNext();
+	var i = 0;
+	while (elmt) {
+		elmts[i++] = elmt;
+		elmt = iterator.iterateNext();
+	}
+	return elmts;
+}
+
+// Loads a single document for a scraper, running succeeded() on success or
+// failed() on failure
+Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
+}
+
+// Downloads and processes documents with processor()
+// browser - a browser object
+// firstDoc - the first document to process with the processor (if null, 
+//            first document is processed without processor)
+// urls - an array of URLs to load
+// processor - a function to execute to process each document
+// done - a function to execute when all document processing is complete
+// exception - a function to execute if an exception occurs (exceptions are
+//             also logged in the Firefox Scholar log)
+Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+	try {
+		if (urls.length == 0) {
+			if (firstDoc) {
+				processor(firstDoc, done);
+			} else {
+				done();
+			}
+			return;
+		}
+		
+		var urlIndex = -1;
+		var doLoad = function() {
+			urlIndex++;
+			if (urlIndex < urls.length) {
+				try {
+					var url = urls[urlIndex];
+					var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+					b.loadURI(url);
+				} catch (e) {
+					exception(e);
+					Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+				}
+			} else {
+				window.setTimeout(done, 10);
+			}
+		};
+		var onLoad = function() {
+			try {
+				var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
+				processor(b.contentDocument, doLoad);
+			} catch (e) {
+				exception(e);
+				Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+			}
+		};
+		var init = function() {
+			var listener;
+			listener.onStateChange = function(webProgress, request, stateFlags, status) {
+				if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
+					request.name == urls[urlIndex]) {
+					try {
+						Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
+					} catch (e) {
+						exception(e);
+						Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
+					}
+				}
+			};
+			
+			var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
+			tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
+			
+			if (firstDoc) {
+				processor(firstDoc, doLoad);
+			} else {
+				doLoad();
+			}
+		}
+		
+		w.addEventListener("load", init, false);
+	} catch (e) {
+		exception(e);
+		PB_Debug.print("processDocuments: " + e);
+	}
+}
+
+// Appears to look for links in a document containing a certain substring
+Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
+	var urls = [];
+	var addedURLs = [];
+	
+	var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE,null);
+	var aElement = aElements.iterateNext();
+	while (aElement) {
+		var href = aElement.href;
+		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
+			urls.unshift(href);
+			addedURLs[href] = true;
+		}
+		aElement = aElements.iterateNext();
+	}
+	return urls;
+}
+
+// For now, we're going to skip the getLLsFromAddresses function (which gets
+// latitude and longitude pairs from a series of addresses, but requires the
+// big mess of Java code that is the Piggy Bank server) and the geoHelper
+// tools (which rely on getLLsFromAddresses) since these are probably not
+// essential components for Scholar and would take a great deal of effort to
+// implement. We can, however, always implement them later.
+
+// It looks like these are simple front-ends for XMLHttpRequest. They're a
+// component of the Piggy Bank API, so they're implemented here.
+// The constructor keeps no state; doGet/doPost/doOptions/stateChange are
+// defined on the prototype.
+Scholar.Ingester.Utilities.HTTPUtilities = function() {}
+
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
+   var xmlhttp = new XMLHttpRequest();
+   
+   xmlhttp.open('GET', url, true);
+   xmlhttp.overrideMimeType("text/xml");
+   xmlhttp.onreadystatechange = function() {
+	  Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+   };
+   xmlhttp.send(null);
+}
+
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
+   var xmlhttp = new XMLHttpRequest();
+   
+   xmlhttp.open('POST', url, true);
+   xmlhttp.overrideMimeType("text/xml");
+   xmlhttp.onreadystatechange = function() {
+	  Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+   };
+   xmlhttp.send(body);
+}
+	
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
+   var xmlhttp = new XMLHttpRequest();
+   
+   xmlhttp.open('OPTIONS', url, true);
+   xmlhttp.overrideMimeType("text/xml");
+   xmlhttp.onreadystatechange = function() {
+	  Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
+   };
+   xmlhttp.send(body);
+}
+	
+// Possible point of failure; for some reason, this used to be a separate
+// class, so make sure it works
+Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
+	switch (xmlhttp.readyState) {
+
+		// Request not yet made
+		case 1:
+		break;
+
+		// Contact established with server but nothing downloaded yet
+		case 2:
+			try {
+				// Check for HTTP status 200
+				if (xmlhttp.status != 200) {
+					if (onStatus) {
+						onStatus(
+							xmlhttp.status,
+							xmlhttp.statusText,
+							xmlhttp
+						);
+						xmlhttp.abort();
+					}
+				}
+			} catch (e) {
+				Scholar.debug(e, 2);
+			}
+		break;
+
+		// Called multiple while downloading in progress
+		case 3:
+		break;
+
+		// Download complete
+		case 4:
+			try {
+				if (onDone) {
+					onDone(xmlhttp.responseText, xmlhttp);
+				}
+			} catch (e) {
+				Scholar.debug(e, 2);
+			}
+		break;
+	}
+}
+//////////////////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.Document
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/* Public properties:
+ * browser - browser window object of document
+ * model - data model for semantic scrapers
+ * scraper - best scraper to use to scrape page
+ *
+ * Private properties:
+ * _sandbox - sandbox for code execution
+ * _progressDialog - dialog showing scrape progress
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Public Scholar.Ingester.Document methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Constructor for Document object
+ */
+Scholar.Ingester.Document = function(browserWindow){
+	this.browser = browserWindow;
+	this.scraper = null
+	this.model = new Scholar.Ingester.Model();
+	this._generateSandbox();
+}
+
+/*
+ * Retrieves the best scraper to scrape a given page
+ */
+Scholar.Ingester.Document.prototype.retrieveScraper = function() {
+	Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
+	var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
+	var scrapers = Scholar.DB.query(sql);
+	for(var i=0; i<scrapers.length; i++) {
+		var currentScraper = scrapers[i];
+		if(this.canScrape(currentScraper)) {
+			this.scraper = currentScraper;
+			Scholar.debug("Found scraper "+this.scraper.label);
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * Check to see if _scraper_ can scrape this document
+ */
+Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
+		var canScrape = false;
+	
+	// Test with regular expression
+	// If this is slow, we could preload all scrapers and compile regular
+	// expressions, so each check will be faster
+	if(currentScraper.urlPattern) {
+		var regularExpression = new RegExp(currentScraper.urlPattern, "i");
+		if(regularExpression.test(this.browser.contentDocument.location.href)) {
+			canScrape = true;
+		}
+	}
+	
+	// Test with JavaScript if available and didn't have a regular expression or
+	// passed regular expression test
+	if((!currentScraper.urlPattern || canScrape)
+	  && currentScraper.scraperDetectCode) {
+		var scraperSandbox = this.sandbox;
+		try {
+			canScrape = this.evalInSandbox("(function(){\n" +
+							   currentScraper.scraperDetectCode +
+							   "\n})()", scraperSandbox);
+		} catch(e) {
+			throw e+' in scraper '+currentScraper.label;
+		}
+	}
+	return canScrape;
+}
+
+/*
+ * Populate model with semantic data regarding this page using _scraper_
+ */
+Scholar.Ingester.Document.prototype.scrapePage = function() {
+	Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
+	
+	var scraperSandbox = this.sandbox;
+	
+	this._progressDialog = openDialog("chrome://scholar/content/ingester/scrape-progress.xul", 
+		"_blank", "chrome,all,dialog=no", null, null, null);
+		
+	Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
+	
+	// If synchronous, call _scrapePageComplete();
+	if(!scraperSandbox._waitForCompletion) {
+		this._scrapePageComplete();
+	}
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Private Scholar.Ingester.Document methods
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Piggy Bank/FS offers four objects to JavaScript scrapers
+ * browser - the object representing the open browser window containing the
+ *           document to be processed
+ * doc - the DOM (basically just browser.contentDocument)
+ * model - the object representing the RDF model of data to be returned
+ *         (see Scholar.Ingester.Model)
+ * utilities - a set of utilities for making certain tasks easier
+ *             (see Scholar.Ingester.Utilities);
+ *
+ * Piggy Bank/FS also offers two functions to simplify asynchronous requests
+ * (these will only be available for scraping, and not for scrape detection)
+ * wait() - called on asynchronous requests so that Piggy Bank/FS will not
+ *          automatically return at the end of code execution
+ * done() - when wait() is called, Piggy Bank/FS will wait for this
+ *          function before returning
+ */
+
+/*
+ * Called when scraping (synchronous or asynchronous) is complete
+ */
+Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
+	this._updateDatabase();
+	this._progressDialog.close();
+}
+ 
+Scholar.Ingester.Document.prototype._generateSandbox = function() {
+	this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
+	this.sandbox.browser = this.browser;
+	this.sandbox.doc = this.sandbox.browser.contentDocument;
+	this.sandbox.utilities = new Scholar.Ingester.Utilities;
+	this.sandbox.model = this.model;
+	this.sandbox.XPathResult = XPathResult;
+	
+	this.sandbox.wait = function(){ this._waitForCompletion = true; };
+	this.sandbox.done = function(){ this._scrapePageComplete(); };
+}
+
+/*
+ * Add data ingested using RDF to database
+ * (Ontologies are hard-coded until we have a real way of dealing with them)
+ */
+Scholar.Ingester.Document.prototype._updateDatabase = function() {
+	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
+	var prefixDC = 'http://purl.org/dc/elements/1.1/';
+	var prefixDCMI = 'http://purl.org/dc/dcmitype/';
+	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
+	
+	for(var uri in this.model.data) {
+		var newItem = Scholar.Items.getNewItemByType(1);
+		newItem.setField("source", uri);
+		if(this.model.data[uri][prefixDC + 'title']) {
+			newItem.setField("title", this.model.data[uri][prefixDC + 'title']);
+		}
+		if(this.model.data[uri][prefixDC + 'publisher']) {
+			newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
+		}
+		if(this.model.data[uri][prefixDC + 'year']) {
+			data.date = this.model.data[uri][prefixDC + 'year'].substring(
+						 this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
+						 this.model.data[uri][prefixDC + 'year'].length);
+		}
+		if(this.model.data[uri][prefixDC + 'edition']) {
+			newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
+		}
+		if(this.model.data[uri][prefixDC + 'identifier']) {
+			newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
+		}
+		if(this.model.data[uri][prefixDummy + 'pages']) {
+			newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages']);
+		}
+		if(this.model.data[uri][prefixDC + 'creator']) {
+			var creator = this.model.data[uri][prefixDC + 'creator'];
+			
+			var spaceIndex = creator.lastIndexOf(" ");
+			var firstName = creator.substring(spaceIndex+1, creator.length);
+			var lastName = creator.substring(0, spaceIndex);
+			
+			newItem.setCreator(0, firstName, lastName);
+		}
+		newItem.save();
+	}
+}
--- a/chrome/chromeFiles/content/scholar/ingester/scrape-progress.xul
+++ b/chrome/chromeFiles/content/scholar/ingester/scrape-progress.xul
@ -0,0 +1,27 @@
+<?xml version="1.0" ?>
+<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
+<!-- NOTE(review): the DOCTYPE below names the root element "overlay" and
+     points at a Piggy Bank locale DTD, although the root element here is
+     "window" and no named entities are used in this file. Confirm whether
+     the declaration can be dropped or should point at a Scholar DTD. -->
+<!DOCTYPE overlay SYSTEM "chrome://piggy-bank/locale/load-dom-dialog.dtd">
+
+<!-- Progress window shown while a page is being scraped. It also hosts the
+     tabbrowser with id "hidden-browser" that the ingester utilities look up
+     in order to load further documents. -->
+<window
+	xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+	xmlns:xul="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+	xmlns:html="http://www.w3.org/1999/xhtml"
+	id="scrape-progress" 
+	windowtype="Options"
+	orient="vertical"
+	screenX="10" screenY="10"
+	persist="width height screenX screenY sizeMode"
+	title="Scraping Page&#8230;" 
+>
+	
+	<hbox flex="1">
+		<vbox flex="1" style="padding: 10px">
+			<label value="Scraping Page&#8230;" />
+			<progressmeter id="progress" mode="undetermined" />
+		</vbox>
+		<resizer id="window-resizer" dir="bottomright"/>
+		<!-- Collapsed container for the tabbrowser used to load documents -->
+		<box style="visibility: collapse">
+			<tabbrowser id="hidden-browser" />
+		</box>
+	</hbox>
+</window>
--- a/chrome/chromeFiles/skin/default/scholar/capture_colored.png
+++ b/chrome/chromeFiles/skin/default/scholar/capture_colored.png
--- a/chrome/chromeFiles/skin/default/scholar/capture_gray.png
+++ b/chrome/chromeFiles/skin/default/scholar/capture_gray.png