#!/usr/bin/env python2

# json2catalog.py - create a "catalog" from a set of HathiTrust json files

# Eric Lease Morgan
# May 18, 2015 - first cut; see https://sharc.hathitrust.org/features
#
# Usage: json2catalog.py <directory>
#
# Writes one tab-delimited row per *.json file found in <directory> to
# standard output, preceded by a header row. The code runs unchanged under
# both Python 2 and Python 3.


# configure
HEADER = "id\ttitle\tpublication date\tpage count\tHathiTrust URL\tlanguage\tMARC (JSON) URL\tWorldCat URL"
WORLDCAT = 'http://worldcat.org/oclc/'

# require
import glob
import json
import sys
import os

# the interpreter's text type: unicode under Python 2, str under Python 3
_text = type(u'')


def catalog_row(record):
    """Return the tab-delimited catalog line for one decoded JSON file.

    record is the object produced by json.load(); the keys read from it are
    'id', 'features' -> 'pageCount', and, under 'metadata': 'title',
    'dateCreated', 'handleUrl', 'language', 'htBibUrl', and 'oclc'.

    Raises KeyError when any of those keys is missing, and TypeError when
    the 'oclc' value cannot be concatenated to the WorldCat URL prefix.
    """

    meta = record['metadata']
    fields = [
        record['id'],
        meta['title'],
        meta['dateCreated'],
        record['features']['pageCount'],
        meta['handleUrl'],
        meta['language'],
        meta['htBibUrl'],
        WORLDCAT + meta['oclc'],
    ]

    # convert with the text type rather than str(); under Python 2, str()
    # raises UnicodeEncodeError as soon as a value contains non-ASCII text
    return u'\t'.join(_text(field) for field in fields)


def _emit(line):
    """Print one line of output, UTF-8 encoding text under Python 2."""

    # str is bytes only under Python 2, where printing unicode to a pipe
    # would otherwise fall back to the ASCII codec
    if str is bytes and isinstance(line, _text):
        line = line.encode('utf-8')
    print(line)


def main(argv=None):
    """Print the catalog for the directory named in argv[1].

    argv defaults to sys.argv. Returns the process exit status: 0 on
    success, 1 when the command line is malformed.
    """

    if argv is None:
        argv = sys.argv

    # sanity check
    if len(argv) != 2:
        sys.stderr.write('Usage: %s <directory>\n' % argv[0])
        return 1

    # get input
    directory = argv[1]

    # initialize
    _emit(HEADER)

    # process each json file in the given directory; os.path.join() makes
    # the pattern correct with or without a trailing separator, and sorting
    # makes the order of the rows reproducible
    for filename in sorted(glob.glob(os.path.join(directory, '*.json'))):

        # open and read the file; binary mode leaves decoding to the json
        # module instead of the locale's default encoding
        with open(filename, 'rb') as data:
            record = json.load(data)

        # parse, format, and output
        _emit(catalog_row(record))

    # done
    return 0


if __name__ == '__main__':
    sys.exit(main())