2011-03-08 17:19:40 +00:00
{
2011-08-11 07:01:19 +00:00
"translatorID" : "fce388a6-a847-4777-87fb-6595e710b7e7" ,
"label" : "ProQuest" ,
"creator" : "Avram Lyon" ,
"target" : "^https?://search\\.proquest\\.com[^/]*(/pqrl|/pqdt|/hnp[a-z]*)?/(docview|publication|publicationissue|results)" ,
"minVersion" : "2.1" ,
"maxVersion" : "" ,
"priority" : 100 ,
"inRepository" : true ,
"translatorType" : 4 ,
2011-08-18 05:07:32 +00:00
"lastUpdated" : "2011-08-03 11:08:32"
2011-03-08 17:19:40 +00:00
}
/*
   ProQuest Translator
   Copyright (C) 2011 Avram Lyon, ajlyon@gmail.com

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Decide what kind of ProQuest page is being viewed.
 *
 * @param {Document} doc The page DOM.
 * @param {String} url The page URL.
 * @returns {String|Boolean} The mapped item type for a record page
 *     ("journalArticle" when the type cannot be mapped), "journalArticle"
 *     for a non-results page that links to an abstract view, "multiple"
 *     for a result list, or false when nothing is recognised.
 */
function detectWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix === 'x') return namespace; else return null;
	} : null;

	// A record page carries the "display record" indexing rows
	var record_rows = doc.evaluate('//div[@class="display_record_indexing_row"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	if (record_rows.iterateNext()) {
		// Declared with var so detection never creates (or depends on) a global `type`
		var type = doc.evaluate('//div[@class="display_record_indexing_fieldname" and contains(text(),"Document Type")]/following-sibling::div[@class="display_record_indexing_data"]',
			doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		if (type) {
			type = mapToZotero(type.textContent.trim());
			if (type) return type;
		}
		// Fall back on journalArticle-- even if we couldn't guess the type
		return "journalArticle";
	}

	// Not a results page, but it links to an abstract view that scrape() can follow
	if (url.indexOf("/results/") === -1) {
		var abstract_link = doc.evaluate('//a[@class="formats_base_sprite format_abstract"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
		if (abstract_link.iterateNext()) {
			return "journalArticle";
		}
	}

	var resultitem = doc.evaluate('//li[@class="resultItem" or contains(@class, "resultItem ")]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	if (resultitem.iterateNext()) {
		return "multiple";
	}
	return false;
}
/**
 * Translator entry point: save the current record, or let the user pick
 * records from a result list and save each of them.
 *
 * @param {Document} doc The page DOM.
 * @param {String} url The page URL.
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix === 'x') return namespace; else return null;
	} : null;

	var detected = detectWeb(doc, url);
	if (!detected) return;

	if (detected != "multiple") {
		scrape(doc, url);
		return;
	}

	// Build a URL -> title map of the listed results for the selection dialog.
	// A plain object is used because the keys are URLs, not array indices.
	var results = doc.evaluate('//li[@class="resultItem" or contains(@class, "resultItem ")]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	var items = {};
	var result;
	while ((result = results.iterateNext())) {
		var link = doc.evaluate('.//a[contains(@class,"previewTitle") or contains(@class,"resultTitle")]', result, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		// Skip result rows without a title link instead of throwing on them
		if (!link) continue;
		items[link.href] = link.textContent;
	}

	Zotero.selectItems(items, function (selected) {
		if (!selected) return true;
		var articles = [];
		for (var itemUrl in selected) {
			if (Object.prototype.hasOwnProperty.call(selected, itemUrl)) {
				articles.push(itemUrl);
			}
		}
		Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	});
	Zotero.wait();
}
/**
 * Build a Zotero item from the "display record" section of a ProQuest
 * record page and complete it, attaching the PDF when one can be found.
 *
 * When the page has no record rows but links to an abstract view, that view
 * is loaded and scraped instead.
 *
 * @param {Document} doc The record page DOM.
 * @returns {Boolean|undefined} true when scraping was delegated to the
 *     abstract page, otherwise undefined.
 */
function scrape(doc) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix === 'x') return namespace; else return null;
	} : null;

	// First node matching `xpath` relative to `context`, or null
	function firstNode(xpath, context) {
		return doc.evaluate(xpath, context, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Append tags without dropping what an earlier field already supplied
	function addTags(tags) {
		if (!item.tags) item.tags = [];
		for (var i = 0; i < tags.length; i++) {
			if (tags[i] && item.tags.indexOf(tags[i]) === -1) item.tags.push(tags[i]);
		}
	}

	var hasRecord = firstNode('//div[@class="display_record_indexing_row"]', doc);
	var abstract_link = firstNode('//a[@class="formats_base_sprite format_abstract"]', doc);
	if (!hasRecord && abstract_link) {
		Zotero.Utilities.processDocuments(abstract_link.href, scrape, function () { Zotero.done(); });
		return true;
	}

	// ProQuest provides us with two different data sources; we can pull the RIS
	// (which is nicely embedded in each page!), or we can scrape the Display Record section
	// We're going to prefer the latter, since it gives us richer data.
	// But since we have it without an additional request, we'll see about falling back on RIS for missing data
	var item = new Zotero.Item();
	var place = [];
	var thesisType = [];

	var record_rows = doc.evaluate('//div[@class="display_record_indexing_row"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	var record_row;
	while ((record_row = record_rows.iterateNext())) {
		var fieldNode = firstNode('./div[@class="display_record_indexing_fieldname"]', record_row);
		if (!fieldNode) continue;
		var field = fieldNode.textContent.trim();

		// A row without a data cell carries nothing to record
		var valueNode = firstNode('./div[@class="display_record_indexing_data"]', record_row);
		if (!valueNode) continue;
		var value = valueNode.textContent.trim();

		// Separate values in a single field are generally wrapped in <a> nodes; pull a list of them
		var valueAResult = doc.evaluate('./div[@class="display_record_indexing_data"]/a', record_row, nsResolver, XPathResult.ANY_TYPE, null);
		var valueAArray = [];
		var valueA;
		while ((valueA = valueAResult.iterateNext())) {
			valueAArray.push(valueA.textContent);
		}

		switch (field) {
			case "Title":
				item.title = value; break;
			case "Authors":
				item.creators = valueAArray.map(
					function (author) {
						return Zotero.Utilities.cleanAuthor(author,
							"author",
							author.indexOf(',') !== -1); // useComma
					});
				break;
			case "Publication title":
				item.publicationTitle = value; break;
			case "Volume":
				item.volume = value; break;
			case "Issue":
				item.issue = value; break;
			case "Pages":
			case "First Page":
				item.pages = value; break;
			case "Number of pages":
				item.numPages = value; break;
			case "Publication year":
			case "Year":
				// Never overwrite a date that is already set
				item.date = (item.date) ? item.date : value; break;
			case "Publication Date":
				item.date = value; break;
			case "Publisher":
				item.publisher = value; break;
			case "Place of Publication": // TODO Change to publisher-place when schema changes
				place[0] = value; break;
			case "Dateline": // TODO Change to event-place when schema changes
				place[0] = value; break;
			case "School location": // TODO Change to publisher-place when schema changes
				place[0] = value; break;
			// blacklisting country-- ProQuest regularly gives us Moscow, United States
			//case "Country of publication":
			//	place[1] = value; break;
			case "ISSN":
				item.ISSN = value; break;
			case "ISBN":
				item.ISBN = value; break;
			case "DOI":
				item.DOI = value; break;
			case "School":
				item.university = value; break;
			case "Degree":
				thesisType[0] = value; break;
			case "Department":
				thesisType[1] = value; break;
			case "Advisor": // TODO Map when exists in Zotero
				break;
			case "Source type":
			case "Document Type":
				// Look the type up once; an unmapped value keeps whatever was set before
				var mappedType = mapToZotero(value);
				if (mappedType) item.itemType = mappedType;
				break;
			case "Copyright":
				item.rights = value; break;
			case "Database":
				value = value.replace(/^\d\s+databasesView list\s+Hide list/, '');
				value = value.replace(/(ProQuest.*)(ProQuest.*)/, '$1; $2');
				item.libraryCatalog = value; break;
			case "Document URL":
				item.attachments.push({url: value.replace(/\?accountid=[0-9]+$/, '') + "/abstract",
					title: "ProQuest Record",
					mimeType: "text/html"}); break;
			case "ProQuest Document ID":
				item.callNumber = value; break;
			case "Language of Publication":
				item.language = value; break;
			case "Section":
				item.section = value; break;
			case "Identifiers / Keywords":
				addTags(value.split(', ')); break;
			case "Subjects":
				addTags(valueAArray); break;
			default: Zotero.debug("Discarding unknown field '" + field + "' => '" + value + "'");
		}
	}

	var abs = firstNode('//div[@id="abstract_field" or @id="abstractSummary"]/p', doc);
	if (abs) {
		item.abstractNote = abs.textContent
			.replace(/\[\s*[Ss]how all\s*\].*/, "")
			.replace(/\[\s*[Ss]how less\s*\].*/, "")
			.replace(/\[\s*PUBLICATION ABSTRACT\s*\]/, "")
			.trim();
	}

	item.place = place.join(', ');
	// filter() drops the hole left when only a Department was given
	item.thesisType = thesisType.filter(function (part) { return part; }).join(', ');
	item.proceedingsTitle = item.publicationTitle;

	// On historical newspapers, we see:
	//   Rights: Copyright New York Times Company Dec 1, 1852
	//   Date: 1852
	// We can improve on this, so we do -- but only when a rights statement
	// exists and actually ends in a full date.
	var fullerDate = item.rights ? item.rights.match(/([A-Z][a-z]{2} \d{1,2}, \d{4}$)/) : null;
	if (fullerDate && (!item.date || item.date.match(/^\d{4}$/))) {
		item.date = fullerDate[1];
	}

	if (!item.itemType && item.libraryCatalog && item.libraryCatalog.match(/Historical Newspapers/))
		item.itemType = "newspaperArticle";

	if (!item.itemType) item.itemType = "journalArticle";

	// Ok, now we'll pull the RIS and run it through the translator. And merge with the temporary item.
	// RIS LOGIC GOES HERE

	// Sometimes the PDF is right on this page
	var realLink = firstNode('//div[@id="pdffailure"]/div[@class="body"]/a', doc);
	if (realLink) {
		item.attachments.push({url: realLink.href,
			title: "ProQuest PDF",
			mimeType: "application/pdf"});
		item.complete();
		return;
	}

	// The PDF link requires two requests-- we fetch the PDF full text page
	var pdf = firstNode('//a[@class="formats_base_sprite format_pdf"]', doc);
	if (!pdf) {
		item.complete();
		return;
	}

	Zotero.Utilities.processDocuments(pdf.href, function (pdfDoc) {
		// This page gives a beautiful link directly to the PDF, right in the HTML
		var pdfLink = pdfDoc.evaluate('//div[@id="pdffailure"]/div[@class="body"]/a', pdfDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		if (pdfLink) {
			item.attachments.push({url: pdfLink.href,
				title: "ProQuest PDF",
				mimeType: "application/pdf"});
		}
		item.complete();
	}, function () {});
}
/**
 * Translate a ProQuest "Source type" / "Document Type" label into a Zotero
 * item type.
 *
 * This map is not complete. See debug output to catch unassigned types.
 *
 * @param {String} type The label exactly as shown by ProQuest.
 * @returns {String|Boolean} The Zotero item type, or false when the label
 *     has no (usable) mapping.
 */
function mapToZotero(type) {
	var map = {
		"Scholarly Journals" : "journalArticle",
		"Book Review-Mixed" : false, // FIX AS NECESSARY
		"Reports" : "report",
		"REPORT" : "report",
		"Historical Newspapers" : "newspaperArticle",
		"Newspapers" : "newspaperArticle",
		//"News" : "newspaperArticle",	// Otherwise Foreign Policy is treated as a newspaper http://search.proquest.com/docview/840433348
		"Magazines" : "magazineArticle",
		"Dissertations & Theses" : "thesis",
		"Dissertation/Thesis" : "thesis",
		"Conference Papers & Proceedings" : "conferencePaper",
		"Wire Feeds" : "newspaperArticle", // Good enough?
		"WIRE FEED" : "newspaperArticle" // Good enough?
	};

	// Own-property check: a label such as "constructor" or "toString" must not
	// resolve to something inherited from Object.prototype
	var mapped = Object.prototype.hasOwnProperty.call(map, type) ? map[type] : false;
	if (mapped) return mapped;

	Zotero.debug("No mapping for type: " + type);
	return false;
}
2011-08-11 07:01:19 +00:00
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "http://search.proquest.com/docview/213445241",
		"items": [
			{
				"itemType": "journalArticle",
				"creators": [
					{ "firstName": "Gerald F", "lastName": "Powers", "creatorType": "author" },
					{ "firstName": "Drew", "lastName": "Christiansen", "creatorType": "author" },
					{ "firstName": "Robert T", "lastName": "Hennemeyer", "creatorType": "author" }
				],
				"notes": [],
				"tags": ["Peace", "Book reviews"],
				"seeAlso": [],
				"attachments": [
					{ "url": false, "title": "ProQuest Record", "mimeType": "text/html" }
				],
				"place": "Winnipeg",
				"title": "Peacemaking: moral & policy challenges for a new world // Review",
				"publicationTitle": "Peace Research",
				"volume": "27",
				"issue": "2",
				"pages": "90-100",
				"numPages": "0",
				"date": "May 1995",
				"publisher": "Menno Simons College",
				"ISSN": "00084697",
				"language": "English",
				"callNumber": "213445241",
				"rights": "Copyright Peace Research May 1995",
				"proceedingsTitle": "Peace Research",
				"libraryCatalog": "ProQuest",
				"shortTitle": "Peacemaking"
			}
		]
	}
]
/** END TEST CASES **/