zotero/translators/ProQuest.js

{
	"translatorID": "fce388a6-a847-4777-87fb-6595e710b7e7",
	"label": "ProQuest",
	"creator": "Avram Lyon",
	"target": "^https?://search\\.proquest\\.com[^/]*(/pqrl|/pqdt|/hnp[a-z]*)?/(docview|publication|publicationissue|results)",
	"minVersion": "2.1",
	"maxVersion": "",
	"priority": 100,
	"inRepository": true,
	"translatorType": 4,
	"lastUpdated": "2011-08-03 11:08:32"
}

/*
   ProQuest Translator
   Copyright (C) 2011 Avram Lyon, ajlyon@gmail.com

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


function detectWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var record_rows = doc.evaluate('//div[@class="display_record_indexing_row"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	if (record_rows.iterateNext()) {
		type = doc.evaluate('//div[@class="display_record_indexing_fieldname" and contains(text(),"Document Type")]/following-sibling::div[@class="display_record_indexing_data"]',
							doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		if (type) {
			type = type.textContent.trim();
			type = mapToZotero(type);
			if (type) return type;
		}
		// Fall back on journalArticle-- even if we couldn't guess the type
		return "journalArticle";
	}

	if (url.indexOf("/results/") === -1) {
		var abstract_link = doc.evaluate('//a[@class="formats_base_sprite format_abstract"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
		if (abstract_link.iterateNext()) {
			return "journalArticle";
		}
	}
	var resultitem = doc.evaluate('//li[@class="resultItem" or contains(@class, "resultItem ")]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	if (resultitem.iterateNext()) {
		return "multiple";
	}
	return false;
}

function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var detected = detectWeb(doc,url);
	if (detected && detected != "multiple") {
		scrape(doc,url);
	} else if (detected) {
		var articles = new Array();
		var results = doc.evaluate('//li[@class="resultItem" or contains(@class, "resultItem ")]', doc, nsResolver, XPathResult.ANY_TYPE, null);
		var items = new Array();
		var result;
		while(result = results.iterateNext()) {
			var link = doc.evaluate('.//a[contains(@class,"previewTitle") or contains(@class,"resultTitle")]', result, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
			var title = link.textContent;
			var url = link.href;
			items[url] = title;
		}
		Zotero.selectItems(items, function (items) {
			if(!items) return true;
			for (var i in items) {
				articles.push(i);
			}
			Zotero.Utilities.processDocuments(articles, scrape, function () {Zotero.done();});
		});
		Zotero.wait();
	}
}

function scrape (doc) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var record_rows = doc.evaluate('//div[@class="display_record_indexing_row"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	var abstract_link = doc.evaluate('//a[@class="formats_base_sprite format_abstract"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (!record_rows && abstract_link) {
		Zotero.Utilities.processDocuments(abstract_link.href, scrape, function() {Zotero.done();});
		return true;
	}
	var url = doc.location.href;

	// ProQuest provides us with two different data sources; we can pull the RIS
	// (which is nicely embedded in each page!), or we can scrape the Display Record section
	// We're going to prefer the latter, since it gives us richer data.
	// But since we have it without an additional request, we'll see about falling back on RIS for missing data

	var item = new Zotero.Item();
	var record_rows = doc.evaluate('//div[@class="display_record_indexing_row"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	var record_row;
	item.place = [];
	item.thesisType = [];
	var account_id;
	while (record_row = record_rows.iterateNext()) {
		var field = doc.evaluate('./div[@class="display_record_indexing_fieldname"]', record_row, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
		if (!field) continue;
		field = field.textContent.trim();
		var value = doc.evaluate('./div[@class="display_record_indexing_data"]', record_row, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.trim();
		// Separate values in a single field are generally wrapped in <a> nodes; pull a list of them
		var valueAResult = doc.evaluate('./div[@class="display_record_indexing_data"]/a', record_row, nsResolver, XPathResult.ANY_TYPE, null);
		var valueA;
		var valueAArray = [];
		// We would like to get an array of the text for each <a> node
		if (valueAResult) {
			while(valueA = valueAResult.iterateNext()) {
				valueAArray.push(valueA.textContent);
			}
		}
		switch (field) {
			case "Title":
					item.title = value; break;
			case "Authors":
					item.creators = valueAArray.map(
							function(author) {
								return Zotero.Utilities.cleanAuthor(author,
									"author",
									author.indexOf(',') !== -1); // useComma
							});
					break;
			case "Publication title":
					item.publicationTitle = value; break;
			case "Volume":
					item.volume = value; break;
			case "Issue":
					item.issue = value; break;
			case "Pages":
			case "First Page":
					item.pages = value; break;
			case "Number of pages":
					item.numPages = value; break;
			case "Publication year":
			case "Year":
					item.date = (item.date) ? item.date : value; break;
			case "Publication Date":
					item.date = value; break;
			case "Publisher":
					item.publisher = value; break;
			case "Place of Publication": // TODO Change to publisher-place when schema changes
					item.place[0] = value; break;
			case "Dateline":	// TODO Change to event-place when schema changes
					item.place[0] = value; break;
			case "School location":	// TODO Change to publisher-place when schema changes
					item.place[0] = value; break;
			// blacklisting country-- ProQuest regularly gives us Moscow, United States
			//case "Country of publication":
			//		item.place[1] = value; break;
			case "ISSN":
					item.ISSN = value; break;
			case "ISBN":
					item.ISBN = value; break;
			case "DOI":
					item.DOI = value; break;
			case "School":
					item.university = value; break;
			case "Degree":
					item.thesisType[0] = value; break;
			case "Department":
					item.thesisType[1] = value; break;
			case "Advisor":		// TODO Map when exists in Zotero
					break;
			case "Source type":
			case "Document Type":
					item.itemType = (mapToZotero(value)) ? mapToZotero(value) : item.itemType; break;
			case "Copyright":
					item.rights = value; break;
			case "Database":
					value = value.replace(/^\d\s+databasesView list\s+Hide list/,'');
					value = value.replace(/(ProQuest.*)(ProQuest.*)/,'$1; $2');
					item.libraryCatalog = value; break;
			case "Document URL":
					item.attachments.push({url:value.replace(/\?accountid=[0-9]+$/,'')+"/abstract",
								title: "ProQuest Record",
								mimeType: "text/html"}); break;
			case "ProQuest Document ID":
					item.callNumber = value; break;
			case "Language of Publication":
					item.language = value; break;
			case "Section":
					item.section = value; break;
			case "Identifiers / Keywords":
					item.tags = value.split(', '); break;
			case "Subjects":
					item.tags = valueAArray; break;
			default: Zotero.debug("Discarding unknown field '"+field+"' => '" +value+ "'");
		}
	}

	var abs = doc.evaluate('//div[@id="abstract_field" or @id="abstractSummary"]/p', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (abs) {
		item.abstractNote = abs.textContent
					.replace(/\[\s*[Ss]how all\s*\].*/,"")
					.replace(/\[\s*[Ss]how less\s*\].*/,"")
					.replace(/\[\s*PUBLICATION ABSTRACT\s*\]/,"")
					.trim();
	}

	item.place = item.place.join(', ');
	item.thesisType = item.thesisType.join(', ');

	item.proceedingsTitle = item.publicationTitle;

	// On historical newspapers, we see:
	// Rights: Copyright New York Times Company Dec 1, 1852
	// Date: 1852
	// We can improve on this, so we do.
	var fullerDate = item.rights.match(/([A-Z][a-z]{2} \d{1,2}, \d{4}$)/);
	if (!item.date ||
		(item.date.match(/^\d{4}$/) && fullerDate)) {
		item.date = fullerDate[1];
	}

	if (!item.itemType && item.libraryCatalog && item.libraryCatalog.match(/Historical Newspapers/))
		item.itemType = "newspaperArticle";

	if(!item.itemType) item.itemType="journalArticle";

	// Ok, now we'll pull the RIS and run it through the translator. And merge with the temporary item.
	// RIS LOGIC GOES HERE

	// Sometimes the PDF is right on this page
	var realLink = doc.evaluate('//div[@id="pdffailure"]/div[@class="body"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (realLink) {
		item.attachments.push({url:realLink.href,
								title:"ProQuest PDF",
								mimeType:"application/pdf"});
		item.complete();
	} else {
		// The PDF link requires two requests-- we fetch the PDF full text page
		var pdf = doc.evaluate('//a[@class="formats_base_sprite format_pdf"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		if (pdf) {
			Zotero.Utilities.processDocuments(pdf.href, function(pdfDoc){
				// This page gives a beautiful link directly to the PDF, right in the HTML
				realLink = pdfDoc.evaluate('//div[@id="pdffailure"]/div[@class="body"]/a', pdfDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
				if (realLink) {
					item.attachments.push({url:realLink.href,
											title:"ProQuest PDF",
											mimeType:"application/pdf"});
				}
				item.complete();
			}, function () {});
		} else {
			item.complete();
		}
	}
}

// This map is not complete. See debug output to catch unassigned types
function mapToZotero (type) {
	var map = {
	"Scholarly Journals" : "journalArticle",
	"Book Review-Mixed" : false, // FIX AS NECESSARY
	"Reports" : "report",
	"REPORT" : "report",
	"Historical Newspapers" : "newspaperArticle",
	"Newspapers" : "newspaperArticle",
	//"News" : "newspaperArticle",	// Otherwise Foreign Policy is treated as a newspaper http://search.proquest.com/docview/840433348
	"Magazines" : "magazineArticle",
	"Dissertations & Theses" : "thesis",
	"Dissertation/Thesis" : "thesis",
	"Conference Papers & Proceedings" : "conferencePaper",
	"Wire Feeds": "newspaperArticle", // Good enough?
	"WIRE FEED": "newspaperArticle" // Good enough?
	}
	if (map[type]) return map[type];
	Zotero.debug("No mapping for type: "+type);
	return false;
}
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "http://search.proquest.com/docview/213445241",
		"items": [
			{
				"itemType": "journalArticle",
				"creators": [
					{
						"firstName": "Gerald F",
						"lastName": "Powers",
						"creatorType": "author"
					},
					{
						"firstName": "Drew",
						"lastName": "Christiansen",
						"creatorType": "author"
					},
					{
						"firstName": "Robert T",
						"lastName": "Hennemeyer",
						"creatorType": "author"
					}
				],
				"notes": [],
				"tags": [
					"Peace",
					"Book reviews"
				],
				"seeAlso": [],
				"attachments": [
					{
						"url": false,
						"title": "ProQuest Record",
						"mimeType": "text/html"
					}
				],
				"place": "Winnipeg",
				"title": "Peacemaking: moral & policy challenges for a new world // Review",
				"publicationTitle": "Peace Research",
				"volume": "27",
				"issue": "2",
				"pages": "90-100",
				"numPages": "0",
				"date": "May 1995",
				"publisher": "Menno Simons College",
				"ISSN": "00084697",
				"language": "English",
				"callNumber": "213445241",
				"rights": "Copyright Peace Research May 1995",
				"proceedingsTitle": "Peace Research",
				"libraryCatalog": "ProQuest",
				"shortTitle": "Peacemaking"
			}
		]
	}
]
/** END TEST CASES **/