434 lines
No EOL
15 KiB
JavaScript
434 lines
No EOL
15 KiB
JavaScript
{
	"translatorID":"53f8d182-4edc-4eab-b5a1-141698a20202",
	"label":"TalisPrism",
	"creator":"William Smith and Emma Reisz",
	"target":"/TalisPrism/(browseResults|doSearch)",
	"minVersion":"1.0.0b4.r5",
	"maxVersion":"",
	"priority":100,
	"inRepository":"1",
	"translatorType":4,
	"lastUpdated":"2010-11-15 11:35:54"
}
|
|
|
|
/* TalisPrism translator.
Version 1.1
By William Smith (http://www.willsmith.org/contactme)
and Emma Reisz

TalisPrism is a library management system used by a number of universities
and public bodies in the UK, Ireland and elsewhere.
For example: http://qu-prism.qub.ac.uk/TalisPrism/
and http://star.shef.ac.uk/TalisPrism/

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

*/
|
|
|
|
|
|
// TalisPrism doesn't use metadata so everything must be scraped.
|
|
|
|
/**
 * Zotero detection entry point: reports what kind of page is being viewed.
 *
 * The URL alone cannot separate a result list from a single record, because
 * a search that finds exactly one item keeps its search URL but is displayed
 * like a browse page, and the title markup of the two layouts is too
 * different to be scraped consistently. The list-versus-record decision is
 * therefore left to searchTest(), with the item type of a record page
 * supplied by docType().
 *
 * @param doc  the page document
 * @param url  the page URL
 * @return 'multiple' when searchTest() yields 1, otherwise the value
 *         returned by docType()
 */
function detectWeb(doc, url) {
	return searchTest(doc, url) == 1 ? 'multiple' : docType(doc, url);
}
|
|
|
|
/**
 * Determines the Zotero item type of a record page from its material icon.
 *
 * The icon's alt text is the best available indicator of the item type on
 * an entry page. Icons are tested in a fixed order and the first one found
 * wins; a page showing none of the known icons is reported as a generic
 * 'document'.
 *
 * The result is held in a local variable, so calling this function no longer
 * creates or overwrites a script-level `doctype` as a side effect.
 *
 * @param doc  the page document
 * @param url  the page URL (unused; kept for a uniform signature)
 * @return 'audioRecording', 'book', 'videoRecording' or 'document'
 */
function docType(doc, url) {
	// [icon alt text, Zotero item type], checked in this order.
	var iconTypes = [
		['sound - disc', 'audioRecording'],
		['Book', 'book'],
		['video - disc', 'videoRecording']
	];

	for (var i = 0; i < iconTypes.length; i++) {
		var iconPath = '//img[@alt="' + iconTypes[i][0] + '"]/@alt';
		if (getXPath(doc, iconPath).length) {
			return iconTypes[i][1];
		}
	}
	return 'document';
}
|
|
|
|
|
|
/**
 * Tells a multi-result listing apart from a page showing a single record.
 *
 * For URLs containing "doSearch" the first node matched by the result-count
 * XPath is read: a count loosely equal to 1 means a solo result (0), anything
 * else, including no match at all, means a listing (1). For every other URL
 * the first node matched by the page-count XPath is read: the page is a
 * listing (1) only when that text contains "Page".
 *
 * @param doc  the page document
 * @param url  the page URL
 * @return 1 for a result listing, 0 for a single record
 */
function searchTest(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = null;
	if (namespace) {
		nsResolver = function (prefix) {
			return prefix == "x" ? namespace : null;
		};
	}

	// Text of the first node matched by `path`, or undefined when none match.
	function firstText(path) {
		var node = doc.evaluate(path, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : undefined;
	}

	if (url.match(/doSearch/)) {
		var resultCount = firstText('//table/tbody/tr/td/table/tbody/tr/td[1]/font/span[@class="text"]/font');
		// Loose comparison on purpose: the count is scraped text.
		return resultCount == 1 ? 0 : 1;
	}

	var pageCount = firstText('//tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/table/tbody/tr/td[4]');
	if (pageCount == undefined) {
		return 0;
	}
	return pageCount.match(/Page/) ? 1 : 0;
}
|
|
|
|
/**
 * Evaluates an XPath expression against the page and returns the text of
 * the first matching node.
 *
 * All working values are local, so the call leaves no `xpath` or `content`
 * variables behind at script level.
 *
 * @param doc    the page document
 * @param field  the XPath expression to evaluate
 * @return the first match's textContent, or '' when nothing matches
 */
function getXPath(doc, field) {
	var node = doc.evaluate(field, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	return node ? node.textContent : '';
}
|
|
|
|
//TalisPrism displays with labels. The getField function searches for the next different field after a label.
|
|
|
|
/**
 * Looks up the value displayed after a given label on a record page.
 *
 * TalisPrism renders every label and every value as a span of class "text".
 * The spans are walked in the order the XPath iterator yields them; once a
 * span whose text equals the label is found, the text of the next span that
 * differs from the label is returned (spans repeating the label text are
 * skipped — presumably nested markup; verify against a live page).
 *
 * All iteration state is local, so nothing leaks to script level and the
 * function cannot disturb getXPath() or a concurrent lookup.
 *
 * @param doc    the page document
 * @param field  the exact label text, e.g. 'Title'
 * @return the value text, or '' when the label or a following value is absent
 */
function getField(doc, field) {
	var spans = doc.evaluate('//span[@class="text"]', doc, null, XPathResult.ANY_TYPE, null);
	var label;
	var value;

	while ((label = spans.iterateNext())) {
		if (label.textContent != field) {
			continue;
		}
		// Label found: the value is the next span with different text.
		while ((value = spans.iterateNext())) {
			if (value.textContent != label.textContent) {
				return value.textContent;
			}
		}
	}
	return '';
}
|
|
|
|
/**
 * processDocuments() callback for one record chosen from a result listing.
 *
 * The URL passed in is ignored in favour of the loaded document's own
 * documentURI. A new Zotero item of the type reported by docType() is
 * created and handed to scrape() to be filled in.
 *
 * @param doc  the loaded record page
 * @param url  ignored; replaced by doc.documentURI
 */
function multiscrape(doc, url) {
	var pageUrl = doc.documentURI;
	scrape(doc, pageUrl, new Zotero.Item(docType(doc, pageUrl)));
}
|
|
|
|
|
|
/**
 * Scrapes the page currently displayed (a record page or a solo search
 * result) without fetching anything.
 *
 * The item type is derived from the page itself with docType(), exactly as
 * multiscrape() does, instead of being read from a script-level `doctype`
 * variable that only exists if some other function assigned it beforehand.
 *
 * @param doc  the page document
 * @param url  ignored; replaced by doc.documentURI
 * @return '' always
 */
function soloscrape(doc, url) {
	var pageUrl = doc.documentURI;
	var item = new Zotero.Item(docType(doc, pageUrl));
	scrape(doc, pageUrl, item);
	return '';
}
|
|
|
|
|
|
/* Removes leading and trailing whitespace from a scraped field. */
function trimField(text) {
	return text.replace(/^\s+|\s+$/g, '');
}

/* Splits an imprint of the usual form "Place : Publisher, yyyy" into date,
   place and publisher. Returns null when it contains no year between 1300
   and 2099. The year is taken straight from the match, so it is found
   whether or not the string ends in a full stop. */
function parseImprint(publishing) {
	var yearMatch = /(13|14|15|16|17|18|19|20)\d\d/.exec(publishing);
	if (!yearMatch) {
		return null;
	}
	var yearPos = yearMatch.index;
	var colon = publishing.indexOf(':');
	// Only a colon in front of the year separates place from publisher.
	var hasPlace = colon != -1 && colon < yearPos;
	return {
		date: yearMatch[0],
		place: hasPlace ? trimField(publishing.substring(0, colon)) : '',
		publisher: publishing.substring(hasPlace ? colon + 1 : 0, yearPos).replace(/^\s+|[\s,]+$/g, '')
	};
}

/* Splits "Physical details" at the first colon into the page count (before
   it) and the remaining description (after it, without a final full stop). */
function parsePhysicalDetails(physical) {
	var colon = physical.indexOf(':');
	var lastDot = physical.lastIndexOf('.');
	// A full stop only ends the description if it comes after the colon.
	var end = lastDot > colon ? lastDot : physical.length;
	return {
		numPages: colon == -1 ? '' : trimField(physical.substring(0, colon)),
		details: trimField(physical.substring(colon + 1, end))
	};
}

/* Decides which cells of a location row hold library and shelfmark.
   Returns 0 when neither sidebar probe cell is present, 1 when the relevant
   probe cell contains "arrow" (sidebar showing, cells shifted by one), and
   -1 when a probe cell exists but has no arrow (layout not recognised). */
function shelfmarkColumnOffset(authorModeCell, browseModeCell) {
	var authorMarkup = authorModeCell ? authorModeCell.innerHTML : null;
	if (authorMarkup != null) {
		return authorMarkup.match(/arrow/) ? 1 : -1;
	}
	var browseMarkup = browseModeCell ? browseModeCell.innerHTML : null;
	if (browseMarkup != null) {
		return browseMarkup.match(/arrow/) ? 1 : -1;
	}
	return 0;
}

/* Reads table rows tr[2]..tr[5] and returns four strings, each
   "<library> <shelfmark>" or '' when the row has no such cells. */
function collectShelfmarks(doc, nsResolver, offset) {
	var rowPrefix = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[';
	var shelfmarks = [];

	for (var i = 0; i < 4; i++) {
		var cells = doc.evaluate(rowPrefix + (i + 2) + ']/td', doc, nsResolver, XPathResult.ANY_TYPE, null);
		var texts = [];
		var cell;
		while ((cell = cells.iterateNext())) {
			texts.push(cell.textContent);
		}

		var parts = [];
		if (texts[offset] != undefined) {
			parts.push(texts[offset]);
		}
		if (texts[offset + 1] != undefined) {
			parts.push(texts[offset + 1]);
		}
		// Remove junk text scraped when there is a request button in the call number field.
		shelfmarks.push(parts.join(' ').replace(/\s*\/*(?:xc_d.write.*\;)/, ''));
	}
	return shelfmarks;
}

/**
 * Fills a Zotero item from a TalisPrism record page and completes it.
 *
 * TalisPrism exposes no metadata, so every value is scraped from the labelled
 * fields of the page (via getField) or located by XPath, then cleaned up
 * because fields often mix several kinds of data.
 *
 * @param doc   the record page document
 * @param url   the page URL; used for the snapshot attachment and, when the
 *              page title is just "TalisPrism", to name the library catalogue
 * @param item  the Zotero item to populate; item.complete() is called on it
 * @return '' always
 */
function scrape(doc, url, item) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == 'x' ? namespace : null;
	} : null;

	// Title: if it includes a forward slash, omit the last bit.
	var title = getField(doc, 'Title');
	if (title.length == 0) {
		title = 'Unknown Title';
	}
	var slash = title.lastIndexOf('/');
	if (slash != -1) {
		title = title.substring(0, slash);
	}
	item.title = trimField(title);

	// Author, falling back to the "other authors" field.
	var author = getField(doc, 'Author');
	if (author.length == 0) {
		author = getField(doc, 'Other Author(s) / Title(s)');
	}
	if (author.length) {
		item.creators.push(Zotero.Utilities.cleanAuthor(author, "author", 1));
	}

	// Place, publisher and publish date share one field whose label varies.
	var publishing = getField(doc, 'Publisher');
	if (publishing.length == 0) {
		publishing = getField(doc, 'Published');
	}
	if (publishing.length == 0) {
		publishing = getField(doc, 'Publication details');
	}
	var imprint = parseImprint(publishing);
	if (imprint) {
		item.date = imprint.date;
		item.place = imprint.place;
		item.publisher = imprint.publisher;
	}

	// ISBN: first run of digits, keeping an ISBN-10 "X" check digit.
	var isbn = getField(doc, 'ISBN');
	if (isbn.length == 0) {
		isbn = getField(doc, 'ISBN, etc.');
	}
	var isbnMatch = /\d+(?:X\b)?/i.exec(isbn);
	if (isbnMatch) {
		item.ISBN = isbnMatch[0];
	}

	// Series: "Name ; number", split at the last semicolon.
	var series = getField(doc, 'Series');
	var semicolon = series.lastIndexOf(';');
	if (semicolon == -1) {
		item.series = trimField(series);
	} else {
		item.series = trimField(series.substring(0, semicolon));
		item.seriesNumber = trimField(series.substring(semicolon + 1));
	}

	item.edition = getField(doc, 'Edition');

	var physical = parsePhysicalDetails(getField(doc, 'Physical details'));
	item.numPages = physical.numPages;

	// Extra holds the indexing note and the physical description,
	// separated so the two do not run into each other.
	var extraParts = [];
	var databasedetails = trimField(getField(doc, 'Cited/indexed in'));
	if (databasedetails.length) {
		extraParts.push(databasedetails);
	}
	if (physical.details.length) {
		extraParts.push(physical.details);
	}
	item.extra = extraParts.join('; ');

	item.attachments.push({url:url, title:"Snapshot of Library Page", mimeType:"text/html"});

	// Pages titled only "TalisPrism" are named after their host instead.
	var doctitle = doc.title;
	if (doctitle == "TalisPrism") {
		item.libraryCatalog = url.substring(url.indexOf('http'), url.indexOf('/TalisPrism'));
	} else {
		item.libraryCatalog = doctitle;
	}

	/* We need to XPath to the call number as we cannot be sure about the previous cell,
	   so the label method won't work. Some items have multiple call numbers,
	   but a generalised XPath which retrieves multiple sets of location data (tr[2], tr[3] etc.)
	   also retrieves tr[1], which contains all the rest of the bibliographic entry.
	   The size of tr[1] varies and there is no consistent final item,
	   so instead of using a general XPath, tr[2], tr[3] and tr[4] are scraped one by one;
	   tr[5] is scraped too, but if it is non-empty 'See record for additional call numbers.'
	   is appended instead of a fourth shelfmark. Each row yields both its Library and
	   Shelfmark cells. */

	// Need to test whether the page has a sidebar showing as this shifts the classmarks.
	var authorModePath = '//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/font/b/span[@class="text"]/table/tbody/tr/td[2]';
	var authorModeObject = doc.evaluate(authorModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	var browseModePath = '//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td[1]';
	var browseModeObject = doc.evaluate(browseModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();

	var offset = shelfmarkColumnOffset(authorModeObject, browseModeObject);
	// An unrecognised layout yields no shelfmarks rather than "undefined" text.
	var shelfmarks = offset < 0 ? [] : collectShelfmarks(doc, nsResolver, offset);

	var listed = [];
	for (var i = 0; i < 3; i++) {
		if (shelfmarks[i]) {
			listed.push(shelfmarks[i]);
		}
	}
	var callNumber = listed.join('; ');
	if (shelfmarks[3]) {
		callNumber = callNumber + ". See record for additional call numbers.";
	}
	item.callNumber = callNumber;

	// Electronic text: labelled field first, then the link in the record table.
	var link = getField(doc, 'Link to');
	if (link.length == 0) {
		var linkPath = '//span[@class="text"]/table/tbody/tr/td/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/a';
		var linkObject = doc.evaluate(linkPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		if (linkObject != null && linkObject.textContent == "Link to electronic text") {
			link = linkObject.href;
		}
	}
	item.url = link;

	item.complete();
	return '';
}
|
|
|
|
/**
 * Zotero translation entry point.
 *
 * Typically scrapers process both search pages and item pages in the same way:
 * processDocuments is used, calling the scraped result link URLs for a search
 * page, and for an item page calling the item page's own URL.
 * But Talis displays solo search results with an unstable URL and with no link
 * to an item page, so the URL of a solo search result cannot be called as it
 * would yield a null page. Solo results are therefore scraped directly from the
 * page already loaded, and item pages are handled the same way because waiting
 * for the item page's URL to be called noticeably slows down the scrape.
 *
 * For a result listing, the title links are offered to the user and the chosen
 * records are fetched and scraped with multiscrape(). Zotero.wait() is called
 * only on that asynchronous path, where Zotero.done() is guaranteed to follow.
 *
 * @param doc  the page document
 * @param url  the page URL
 * @return true when the user cancels the selection dialog, otherwise undefined
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == "x" ? namespace : null;
	} : null;

	// Deliberately left as a script-level variable (no `var`): other functions
	// in this translator may read `doctype` after doWeb has set it.
	doctype = detectWeb(doc, url);

	if (doctype != "multiple") {
		soloscrape(doc, url);
		return;
	}

	// Listings come in two layouts. When the first matched cell reads "Index"
	// and the second is empty, the title links are in column 3; otherwise they
	// are in column 4. A page with no such cells is treated as the latter.
	var indexPath = '//span[@class="text"]/x:table/x:tbody/x:tr/x:td/x:table/x:tbody/x:tr/x:td[1]';
	var indexCells = doc.evaluate(indexPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
	var firstCell = indexCells.iterateNext();
	var secondCell = firstCell ? indexCells.iterateNext() : null;
	var index = firstCell ? firstCell.textContent : undefined;
	var index1 = secondCell ? secondCell.textContent : undefined;
	var hasIndexColumn = !!index && /Index/.test(index) && index1 == "";

	var titlePath = '//td[' + (hasIndexColumn ? 3 : 4) + ']/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[1]/font/span[@class="text"]/a';
	var titles = doc.evaluate(titlePath, doc, nsResolver, XPathResult.ANY_TYPE, null);

	var items = {};
	var nextTitle;
	while ((nextTitle = titles.iterateNext())) {
		items[nextTitle.href] = nextTitle.textContent;
	}

	items = Zotero.selectItems(items);
	if (!items) {
		// Selection dialog cancelled: nothing to fetch.
		return true;
	}

	var articles = [];
	for (var itemUrl in items) {
		articles.push(itemUrl);
	}

	Zotero.Utilities.processDocuments(articles, multiscrape, function () { Zotero.done(); });
	Zotero.wait();
}