Beginnings of search result scraping (does not yet actually do the scraping, but does present the menu)
This commit is contained in:
parent
428eab6a95
commit
ca3a0e6e5d
7 changed files with 144 additions and 43 deletions
|
@ -61,11 +61,21 @@ Scholar_Ingester_Interface.chromeUnload = function() {
|
|||
/*
 * Scrapes the page shown in the currently selected tab, provided a scraper
 * has been attached to that tab's document object.
 *
 * If the document object carries a scrapeURLList (i.e. several scrapable
 * URLs were detected), the item selection dialog is shown first. Note that
 * scraping currently proceeds after the dialog closes regardless of what the
 * user did in it.
 */
Scholar_Ingester_Interface.scrapeThisPage = function() {
	var pageDocument = Scholar_Ingester_Interface._getDocument(
		Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
	
	// Nothing to do when no scraper is available for this page
	if(!pageDocument.scraper) {
		return;
	}
	
	// In the case that there are multiple scrapable URLs, make the user choose
	if(pageDocument.scrapeURLList) {
		Scholar_Ingester_Interface.chooseURL(pageDocument);
	}
	
	// Show the progress indicator over the content of the selected tab
	Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(
		window,
		Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument,
		Scholar.getString("ingester.scraping")
	);
	
	// _finishScraping is handed over as the completion callback
	pageDocument.scrapePage(Scholar_Ingester_Interface._finishScraping);
};
|
||||
|
||||
/*
 * Opens the modal item selection dialog (selectitems.xul) and hands it the
 * given document object as its first window argument.
 */
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
	Scholar.debug("chooseURL called");
	
	// The dialog is modal, so this call only returns once it has been closed
	window.openDialog(
		"chrome://scholar/content/ingester/selectitems.xul",
		"_blank",
		"chrome,modal,centerscreen,resizable=yes",
		documentObject
	);
};
|
||||
|
||||
/*
|
||||
* Updates the status of the capture icon to reflect the scrapability or lack
|
||||
* thereof of the current page
|
||||
|
@ -108,7 +118,6 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
|
|||
* appropriate status indicator for the current tab, and to free useless objects
|
||||
*/
|
||||
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
||||
Scholar.debug("onLocationChange called");
|
||||
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
||||
|
||||
// Remove document object of any browser that no longer exists
|
||||
|
@ -130,25 +139,6 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
|
|||
Scholar_Ingester_Interface._deleteDocument(browser);
|
||||
}
|
||||
}
|
||||
|
||||
/*// Add a collector to any new browser
|
||||
for (var i = 0; i < browsers.length; i++) {
|
||||
var browser = browsers[i];
|
||||
var exists = false;
|
||||
|
||||
for (var j = 0; j < Scholar_Ingester_Interface.browsers.length; j++) {
|
||||
if (browser == Scholar_Ingester_Interface.browsers[j]) {
|
||||
exists = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!exists) {
|
||||
Scholar_Ingester_Interface.browsers.splice(i,0,browser);
|
||||
|
||||
// To execute if window is new
|
||||
}
|
||||
}*/
|
||||
|
||||
Scholar_Ingester_Interface.updateStatus(
|
||||
Scholar_Ingester_Interface.tabBrowser.selectedBrowser
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
|
||||
<!-- Note: Contains Firefox-specific overlay -->
|
||||
|
||||
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
|
||||
|
||||
<script src="../include.js"/>
|
||||
|
||||
<script src="browser.js"/>
|
||||
|
||||
<script type="application/x-javascript">
|
||||
|
|
44
chrome/chromeFiles/content/scholar/ingester/selectitems.js
Normal file
44
chrome/chromeFiles/content/scholar/ingester/selectitems.js
Normal file
|
@ -0,0 +1,44 @@
|
|||
//////////////////////////////////////////////////////////////////////////////
//
// Scholar_Ingester_Interface_SelectItems
//
//////////////////////////////////////////////////////////////////////////////

// Backs the item selection dialog (selectitems.xul): fills the listbox from
// the scrapeURLList of the document object passed in as the first window
// argument, and writes the user's selection back to that object on accept

// Declared explicitly so that no implicit global is created
var Scholar_Ingester_Interface_SelectItems = function() {};

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar_Ingester_Interface_SelectItems methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Called from the dialog's onload handler. Stores the document object passed
 * as window.arguments[0] and adds one unchecked checkbox listitem per entry
 * of its scrapeURLList (value = key, label = entry text)
 */
Scholar_Ingester_Interface_SelectItems.init = function() {
	this.documentObject = window.arguments[0];
	this.listbox = document.getElementById("scholar-selectitems-links");
	
	var urlList = this.documentObject.scrapeURLList;
	for(var i in urlList) {	// we could use a tree for this if we wanted to
		var itemNode = document.createElement("listitem");
		itemNode.setAttribute("type", "checkbox");
		itemNode.setAttribute("value", i);
		itemNode.setAttribute("label", urlList[i]);
		itemNode.setAttribute("checked", false);
		this.listbox.appendChild(itemNode);
	}
};

/*
 * Called from the dialog's ondialogaccept handler. Replaces the document
 * object's scrapeURLList with a new object containing only the rows the user
 * checked (again keyed by the item value, with the label as the entry text)
 */
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
	var selectedItems = new Object();
	
	// A listbox has no length/index access; walk its rows through the
	// listbox API instead
	var rowCount = this.listbox.getRowCount();
	for(var i=0; i<rowCount; i++) {
		var itemNode = this.listbox.getItemAtIndex(i);
		
		// Only keep the rows whose checkbox is ticked
		if(itemNode.getAttribute("checked") == "true") {
			selectedItems[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
		}
	}
	
	// Swap in the new list only once it is complete
	this.documentObject.scrapeURLList = selectedItems;
};
|
23
chrome/chromeFiles/content/scholar/ingester/selectitems.xul
Executable file
23
chrome/chromeFiles/content/scholar/ingester/selectitems.xul
Executable file
|
@ -0,0 +1,23 @@
|
|||
<?xml version="1.0"?>
|
||||
<!--
|
||||
Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivion)
|
||||
-->
|
||||
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
|
||||
<!DOCTYPE window SYSTEM "chrome://scholar/locale/scholar.dtd">
|
||||
<dialog xmlns:html="http://www.w3.org/1999/xhtml"
|
||||
xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
|
||||
title="&selectitems.title;" width="400" height="330"
|
||||
persist="width height screenX screenY"
|
||||
buttons="cancel,accept"
|
||||
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
|
||||
ondialogcancel="self.close()"
|
||||
id="scholar-selectitems"
|
||||
onload="Scholar_Ingester_Interface_SelectItems.init()">
|
||||
|
||||
<script src="../include.js"/>
|
||||
<script src="selectitems.js"/>
|
||||
<caption label="&selectitems.intro.label;" id="scholar-selectitems-intro"/>
|
||||
<box flex="1">
|
||||
<listbox id="scholar-selectitems-links" flex="1" context="scholarSelectContext"></listbox>
|
||||
</box>
|
||||
</dialog>
|
|
@ -283,7 +283,7 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
|
|||
*/
|
||||
Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
	// Strip surrounding whitespace first (delegated to trimString)
	s = this.trimString(s);
	
	// Collapse every run of spaces and/or non-breaking spaces (\xA0) into a
	// single ordinary space. An earlier space-only return that preceded this
	// line made it unreachable and has been removed.
	return s.replace(/[ \xA0]+/g, " ");
};
|
||||
|
||||
/*
|
||||
|
@ -569,7 +569,15 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
|
|||
"\n})()", scraperSandbox);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in scraperDetectCode for '+currentScraper.label);
|
||||
canScrape = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// scraperDetectCode returns an associative array (object) in the case of a search result
|
||||
if(typeof(canScrape) == "object") {
|
||||
Scholar.debug("scraperDetectCode returned a URL list");
|
||||
this.scrapeURLList = canScrape;
|
||||
} else {
|
||||
Scholar.debug("canScrape was a "+typeof(canScrape));
|
||||
}
|
||||
}
|
||||
return canScrape;
|
||||
|
|
|
@ -24,4 +24,9 @@
|
|||
<!ENTITY toolbar.newCollection.label "New Project">
|
||||
<!ENTITY toolbar.renameCollection.label "Rename Project...">
|
||||
<!ENTITY toolbar.removeCollection.label "Remove Project...">
|
||||
<!ENTITY toolbar.search.label "Search:">
|
||||
<!ENTITY toolbar.search.label "Search:">
|
||||
|
||||
<!ENTITY selectitems.title "Select Items">
|
||||
<!ENTITY selectitems.intro.label "Select which items you'd like to add to your library">
|
||||
<!ENTITY selectitems.cancel.label "Cancel">
|
||||
<!ENTITY selectitems.select.label "OK">
|
67
scrapers.sql
67
scrapers.sql
|
@ -175,24 +175,59 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
|
|||
wait();');
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-18 11:02:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
||||
'try {
|
||||
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||
return false;
|
||||
}
|
||||
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||
for(i in export_options) {
|
||||
if(export_options[i].text == ''Latin1 MARC''
|
||||
|| export_options[i].text == ''Raw MARC''
|
||||
|| export_options[i].text == ''UTF-8''
|
||||
|| export_options[i].text == ''MARC (Unicode/UTF-8)''
|
||||
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
|
||||
return true;
|
||||
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||
// We have search results
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||
|
||||
// Require link to match this
|
||||
var tagRegexp = new RegExp();
|
||||
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
|
||||
// Do not allow text to match this
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(''\[ [0-9]+ \]'');
|
||||
|
||||
var links = doc.getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(tagRegexp.test(links[i].href)) {
|
||||
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
|
||||
if(text) {
|
||||
text = utilities.cleanString(text);
|
||||
if(!rejectRegexp.test(text)) {
|
||||
if(availableItems[links[i].href]) {
|
||||
availableItems[links[i].href] += " "+text;
|
||||
} else {
|
||||
availableItems[links[i].href] = text;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch(e) {
|
||||
return false;
|
||||
}',
|
||||
|
||||
if(availableItems) {
|
||||
return availableItems;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||
for(i in export_options) {
|
||||
if(export_options[i].text == ''Latin1 MARC''
|
||||
|| export_options[i].text == ''Raw MARC''
|
||||
|| export_options[i].text == ''UTF-8''
|
||||
|| export_options[i].text == ''MARC (Unicode/UTF-8)''
|
||||
|| export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
|
||||
// We have an exportable single record
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;',
|
||||
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||
|
|
Loading…
Add table
Reference in a new issue