Beginnings of search result scraping (does not yet actually do the scraping, but does present the menu)

This commit is contained in:
Simon Kornblith 2006-06-22 02:43:40 +00:00
parent 428eab6a95
commit ca3a0e6e5d
7 changed files with 144 additions and 43 deletions

View file

@ -61,11 +61,21 @@ Scholar_Ingester_Interface.chromeUnload = function() {
Scholar_Ingester_Interface.scrapeThisPage = function() {
	// Scrapes whatever is loaded in the currently selected tab, provided a
	// scraper was found for that page's document object
	var ingester = Scholar_Ingester_Interface;
	var pageDocument = ingester._getDocument(ingester.tabBrowser.selectedBrowser);

	// Nothing to do when no scraper is attached to this page
	if(!pageDocument.scraper) {
		return;
	}

	// A URL list means several scrapable items were detected, so the user is
	// asked to pick among them first
	if(pageDocument.scrapeURLList) {
		ingester.chooseURL(pageDocument);
	}

	// The selected browser is looked up again here rather than cached, exactly
	// as before, because a dialog may have been shown in between
	var contentDocument = ingester.tabBrowser.selectedBrowser.contentDocument;
	var progressLabel = Scholar.getString("ingester.scraping");
	ingester.scrapeProgress = new ingester.Progress(window, contentDocument, progressLabel);

	pageDocument.scrapePage(ingester._finishScraping);
};
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
	// Opens the item selection dialog and hands it documentObject as its first
	// dialog argument. The "modal" feature is requested, so the call is
	// expected to return only once the dialog has been dismissed.
	Scholar.debug("chooseURL called");

	var dialogURL = "chrome://scholar/content/ingester/selectitems.xul";
	var dialogFeatures = "chrome,modal,centerscreen,resizable=yes";
	window.openDialog(dialogURL, "_blank", dialogFeatures, documentObject);
};
/*
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
@ -108,7 +118,6 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
* appropriate status indicator for the current tab, and to free useless objects
*/
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
Scholar.debug("onLocationChange called");
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
// Remove document object of any browser that no longer exists
@ -130,25 +139,6 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
Scholar_Ingester_Interface._deleteDocument(browser);
}
}
/*// Add a collector to any new browser
for (var i = 0; i < browsers.length; i++) {
var browser = browsers[i];
var exists = false;
for (var j = 0; j < Scholar_Ingester_Interface.browsers.length; j++) {
if (browser == Scholar_Ingester_Interface.browsers[j]) {
exists = true;
break;
}
}
if (!exists) {
Scholar_Ingester_Interface.browsers.splice(i,0,browser);
// To execute if window is new
}
}*/
Scholar_Ingester_Interface.updateStatus(
Scholar_Ingester_Interface.tabBrowser.selectedBrowser

View file

@ -1,12 +1,8 @@
<?xml version="1.0"?>
<!-- Note: Contains Firefox-specific overlay -->
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
<script src="../include.js"/>
<script src="browser.js"/>
<script type="application/x-javascript">

View file

@ -0,0 +1,44 @@
//////////////////////////////////////////////////////////////////////////////
//
// Scholar_Ingester_Interface_SelectItems
//
//////////////////////////////////////////////////////////////////////////////
// Class to interface with the browser when ingesting data
// Declared with var so the namespace object is never created as an implicit
// global (which is an error in strict-mode code)
var Scholar_Ingester_Interface_SelectItems = function() {};

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar_Ingester_Interface_SelectItems methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Fills the dialog's listbox with one unchecked checkbox item for every entry
 * of the scrapeURLList carried by the document object that was passed as the
 * first dialog argument. Each item stores the list key as its "value" and the
 * list text as its "label".
 */
Scholar_Ingester_Interface_SelectItems.init = function() {
	this.documentObject = window.arguments[0];
	this.listbox = document.getElementById("scholar-selectitems-links");

	var urlList = this.documentObject.scrapeURLList;
	for(var url in urlList) {	// we could use a tree for this if we wanted to
		var itemNode = document.createElement("listitem");
		itemNode.setAttribute("type", "checkbox");
		itemNode.setAttribute("value", url);
		itemNode.setAttribute("label", urlList[url]);
		itemNode.setAttribute("checked", false);
		this.listbox.appendChild(itemNode);
	}
};

/*
 * Replaces the document object's scrapeURLList with a new object holding only
 * the items the user checked (value => label). If nothing is checked, the
 * list ends up empty.
 */
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
	var selectedItems = new Object();

	// A XUL listbox has no length property or index access; its rows are
	// reached through getRowCount()/getItemAtIndex()
	var rowCount = this.listbox.getRowCount();
	for(var i=0; i<rowCount; i++) {
		var itemNode = this.listbox.getItemAtIndex(i);
		// Only keep rows whose checkbox is ticked
		if(itemNode.getAttribute("checked") === "true") {
			selectedItems[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
		}
	}

	// Assign only once the new list is complete
	this.documentObject.scrapeURLList = selectedItems;
};

View file

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<!--
Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivion)

Item selection dialog. On load it calls
Scholar_Ingester_Interface_SelectItems.init(); when the accept button is
pressed it calls Scholar_Ingester_Interface_SelectItems.acceptSelection().
Cancelling simply closes the window.
-->
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
<!DOCTYPE window SYSTEM "chrome://scholar/locale/scholar.dtd">
<dialog xmlns:html="http://www.w3.org/1999/xhtml"
xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
title="&selectitems.title;" width="400" height="330"
persist="width height screenX screenY"
buttons="cancel,accept"
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
ondialogcancel="self.close()"
id="scholar-selectitems"
onload="Scholar_Ingester_Interface_SelectItems.init()">
<script src="../include.js"/>
<script src="selectitems.js"/>
<caption label="&selectitems.intro.label;" id="scholar-selectitems-intro"/>
<box flex="1">
<!-- Starts out empty; the script looks this element up by its id and appends the selectable items -->
<!-- NOTE(review): no popup with id "scholarSelectContext" is defined in this file; confirm it exists elsewhere -->
<listbox id="scholar-selectitems-links" flex="1" context="scholarSelectContext"></listbox>
</box>
</dialog>

View file

@ -283,7 +283,7 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
*/
Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
	// Trim the string via trimString(), then collapse every run of ordinary
	// spaces and/or non-breaking spaces (U+00A0) into a single ordinary space.
	// A single return is used: the earlier space-only replace made the
	// non-breaking-space handling unreachable.
	s = this.trimString(s);
	return s.replace(/[ \xA0]+/g, " ");
}
/*
@ -569,7 +569,15 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
"\n})()", scraperSandbox);
} catch(e) {
Scholar.debug(e+' in scraperDetectCode for '+currentScraper.label);
canScrape = false;
return false;
}
// scraperDetectCode returns an associative array (object) in the case of a search result
if(typeof(canScrape) == "object") {
Scholar.debug("scraperDetectCode returned a URL list");
this.scrapeURLList = canScrape;
} else {
Scholar.debug("canScrape was a "+typeof(canScrape));
}
}
return canScrape;

View file

@ -24,4 +24,9 @@
<!ENTITY toolbar.newCollection.label "New Project">
<!ENTITY toolbar.renameCollection.label "Rename Project...">
<!ENTITY toolbar.removeCollection.label "Remove Project...">
<!ENTITY toolbar.search.label "Search:">
<!ENTITY toolbar.search.label "Search:">
<!ENTITY selectitems.title "Select Items">
<!ENTITY selectitems.intro.label "Select which items you'd like to add to your library">
<!ENTITY selectitems.cancel.label "Cancel">
<!ENTITY selectitems.select.label "OK">

View file

@ -175,24 +175,59 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
wait();');
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-18 11:02:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
'try {
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
return false;
}
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(i in export_options) {
if(export_options[i].text == ''Latin1 MARC''
|| export_options[i].text == ''Raw MARC''
|| export_options[i].text == ''UTF-8''
|| export_options[i].text == ''MARC (Unicode/UTF-8)''
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
return true;
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
// We have search results
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var availableItems = new Object(); // Technically, associative arrays are objects
// Require link to match this
var tagRegexp = new RegExp();
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
// Do not allow text to match this
var rejectRegexp = new RegExp();
rejectRegexp.compile(''\[ [0-9]+ \]'');
var links = doc.getElementsByTagName("a");
for(var i=0; i<links.length; i++) {
if(tagRegexp.test(links[i].href)) {
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
if(text) {
text = utilities.cleanString(text);
if(!rejectRegexp.test(text)) {
if(availableItems[links[i].href]) {
availableItems[links[i].href] += " "+text;
} else {
availableItems[links[i].href] = text;
}
}
}
}
}
return false;
} catch(e) {
return false;
}',
if(availableItems) {
return availableItems;
} else {
return false;
}
}
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(i in export_options) {
if(export_options[i].text == ''Latin1 MARC''
|| export_options[i].text == ''Raw MARC''
|| export_options[i].text == ''UTF-8''
|| export_options[i].text == ''MARC (Unicode/UTF-8)''
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
// We have an exportable single record
return true;
}
}
return false;',
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';