#!/usr/bin/env python
# updated 12/12/06
# catalog_processor.py Jim Tuttle 10/2006 http://braggtown.com
# Purpose: Create pdf files for ingest into DSpace institutional repository using scanned images
# of course catalogs and database text file dump.
# Background: The Special Collections department in the NCSU Libraries was tasked with digitizing historic course catalogs and ingesting them into
# a DSpace digital repository. The catalogs were scanned into tif images and metadata for the images was entered into an Access database. The
# Access database metadata was dumped to two tab-delimited text files, one used to process the images and create DSpace Simple Ingest Items and
# the other to populate the Qualified Dublin Core metadata files for each item.
# Usage: This script takes several command line arguments, or flags, to inform the script of key locations.
# The -t flag should direct the script to the directory containing the database-dumped text files to process.
# The -i flag is used to inform the script of the path to the directory containing the images to be processed.
# The -o flag should point to an output directory which may not yet exist. The script will create it if it doesn't yet exist. It won't create
# multiple levels of directories, however. The -v flag activates verbose output, which prints information to the terminal about the activities
# of the script. See the following example:
# ./catalog_processor.py -t /media/USB_DRIVE/course_catalogs/text_files -i /media/USB_DRIVE/course_catalogs -o /home/jjtuttle/Desktop/junk -v
# In this example, notice that the /home/jjtuttle/Desktop directory exists, but the /home/jjtuttle/Desktop/junk directory does not. The script
# will read the text files in /media/USB_DRIVE/course_catalogs/text_files, will process the images in /media/USB_DRIVE/course_catalogs, and will
# output the pdf's, metadata, and contents files to /home/jjtuttle/Desktop/junk. It will also print verbose output to the terminal.
# Notice that the metadata dump file assigned to the mdfile variable is hard-coded. This variable value may be changed if necessary. Also, the
# column value assignments and dublin core values, in the readFile and doMetadata functions respectively, are very situation dependent. They
# may easily be changed to extend the use of the script to other projects.
# Requirements: This script relies on imagemagick to convert the images to pdf's. This script was written and tested on Linux and it is not
# known if it successfully executes on other platforms. It was written in Python 2.4, though it may work with other versions.
# Issues outstanding: tiff2pdf, used to convert the tif images to multi-image pdf's, does not perform optical character recognition, or OCR,
# thus the pdf's are images and are therefore not indexable text. It is assumed that following the completion of this script a user will
# use Adobe Acrobat to batch OCR the pdf's.
# Personal notes:
# Had to scrap imagemagick convert. Took ages and delivered pdf's 10X larger than sum of tiff images. Using tiff2pdf and tiffcp from libtiff-tools Debian package.

from commands import getoutput
from optparse import OptionParser
from os import walk, mkdir, listdir, remove
from os.path import join, split, exists
from re import sub
from subprocess import PIPE, STDOUT, Popen
from sys import exit
from time import time

# main. Captures arguments passed on the command line.
def main():
    """Parse the command line and run the whole catalog conversion.

    Options: -t text-file directory, -i image directory, -o output
    directory (created when missing, one level only) and -v for verbose
    output.  Every ``.txt`` structure file in the text directory except the
    metadata dump is read; for each collection it names, one pdf per
    division sequence number is built and a ``contents`` manifest is
    written.  Afterwards the metadata dump is turned into Dublin Core files
    and a summary is printed.
    """
    parser = OptionParser(usage="%prog [options]", version="%prog 0.1")
    parser.add_option("-t", "--text", dest="text",
                      help="path to text files", metavar="path")
    parser.add_option("-i", "--images", dest="images",
                      help="path to images", metavar="path")
    parser.add_option("-o", "--output", dest="outdir",
                      help="path to output directory", metavar="path")
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                      default=False, help="print messages to terminal")
    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no else branch is needed.
    if not options.text:
        parser.error("text directory must be declared")
    if not options.images:
        parser.error("image directory must be declared")
    if not options.outdir:
        parser.error("output directory must be declared")

    textdir = options.text
    imagedir = options.images
    outdir = options.outdir
    if not exists(outdir):
        mkdir(outdir)
    verbosity = bool(options.verbose)

    mdfile = "DSpace_DublinCore.txt"  # set name of metadata dump file
    btime = time()                    # process begin time
    imgcount = pdfcount = 0           # counters for images and pdfs processed

    for textfile in textList(textdir):
        if textfile == mdfile:
            continue  # the metadata dump is handled by doMetadata below
        # list of elements and collections from structure file
        full_list, collection_list = readFile(textdir, textfile)
        # quick check to see if images can be found
        checkImagesExist(imagedir, full_list)
        for collection in collection_list:
            destdir = join(outdir, collection)
            # Tolerate an existing directory so a re-run, or a second
            # structure file naming the same collection, does not crash.
            if not exists(destdir):
                mkdir(destdir)
            # list of images only in this collection
            this_collection = isolateCollection(collection, full_list)
            # DivisionSequenceNumbers are unique only within a collection
            for divSeq in getDivSeq(this_collection):
                pdfcount += 1
                pagelist, timgcount = getPageList(divSeq, this_collection)
                imgcount += timgcount
                # find image directory for those images
                workdir = getWorkDir(imagedir, pagelist[0][6])
                makePdf(workdir, pagelist, destdir, collection, verbosity)
        listContents(outdir, collection_list)

    doMetadata(textdir, mdfile, imagedir, outdir, verbosity)
    summarize(btime, imgcount, pdfcount)


def _promptUser(message):
    """Read one line from the terminal on Python 2 (raw_input) or Python 3 (input)."""
    try:
        reader = raw_input
    except NameError:
        reader = input
    return reader(message)


def checkImagesExist(imagedir, full_list):
    """Warn the user about images named in the structure file but not on disk.

    Each row's file name (column 6) is expected at
    ``imagedir/<collection>/<filename>``, where the collection is the part
    of the file name before the first underscore.  When anything is missing
    the paths are printed and the user chooses whether to continue; any
    answer other than ``y`` ends the program.  Returns None.
    """
    missing = []
    for image in full_list:
        filename = image[6]
        # Split on the first underscore only, so names with additional
        # underscores (or none) are reported instead of raising ValueError.
        subdir = filename.split('_', 1)[0]
        image_path = join(imagedir, join(subdir, filename))
        if not exists(image_path):
            missing.append("%s could not be found. \n" % (image_path))
    if not missing:
        return
    print("Warning! The following images are referenced in the structure file but could not be found.")
    print("".join(missing))
    answer = _promptUser("Press y to continue or n to quit, then enter ")
    if answer.strip().lower() != 'y':
        exit()


# given list of all images, return a list of only the images in the collection specified by the collection parameter. highly inefficient.
def _collectionName(filename):
    """Return the collection prefix of an image file name ('2002u' for '2002u_001.tif')."""
    # Split on the first underscore only: extra underscores must not raise.
    return filename.split('_', 1)[0]


def isolateCollection(collection, full_list):
    """Return the rows of full_list whose image (column 6) belongs to collection."""
    return [line for line in full_list
            if _collectionName(line[6]) == collection]


def textList(textdir):
    """Return the sorted names of the ``.txt`` files directly inside textdir.

    Only the top level is listed: callers join the returned bare names to
    textdir, so files in subdirectories could never be opened anyway.
    """
    for root, dirs, files in walk(textdir):
        # The first tuple yielded by walk() describes textdir itself.
        return sorted([name for name in files if name.endswith('.txt')])
    return []  # textdir does not exist or cannot be read


def listContents(outdir, collection_list):
    """Write a ``contents`` manifest into each collection directory.

    The manifest lists every entry of ``outdir/<collection>``, one per line
    in sorted order, excluding the manifest itself so that a re-run does not
    list a stale ``contents`` file.
    """
    for collection in collection_list:
        directory = join(outdir, collection)
        entries = [name for name in listdir(directory) if name != 'contents']
        entries.sort()
        manifest = open(join(directory, 'contents'), 'w')
        try:
            for name in entries:
                manifest.write(name + '\n')
        finally:
            manifest.close()


def getWorkDir(imagedir, filename):
    """Return the directory holding filename: ``imagedir/<collection prefix>``."""
    return join(imagedir, _collectionName(filename))


def _runTool(argv):
    """Run an external program without a shell; return (exit status, combined output).

    Passing an argument list keeps paths containing spaces or shell
    metacharacters intact.  Raises OSError when the program is not installed.
    """
    proc = Popen(argv, stdout=PIPE, stderr=STDOUT, universal_newlines=True)
    output = proc.communicate()[0]
    return proc.returncode, output.rstrip('\n')


def makePdf(workdir, pagelist, destdir, collection, verbosity):
    """Combine the pages of one division into a single pdf in destdir.

    Depends on the libtiff tools: ``tiffcp`` concatenates the page images
    into a temporary multi-page tiff, ``tiff2pdf`` converts that to pdf.
    The pdf is named ``<divSeq>_<divTitle>.pdf`` with the title reduced to
    word characters and cut to 40 characters.

    workdir   -- directory containing the page images
    pagelist  -- rows for the pages, in order; column 6 is the file name,
                 column 2 the division title, column 3 the sequence number
    destdir   -- existing directory that receives the pdf
    collection -- unused; kept for interface compatibility
    verbosity -- print progress and tool output when true
    """
    if not pagelist:
        return  # nothing to convert

    sources = []
    for page in pagelist:
        sources.append(join(workdir, page[6]))
        if verbosity:
            print("Converting  %s" % page[6])

    # The name comes from the last page's row, as before.
    last = pagelist[-1]
    divtitle = sub(r'\W', '', last[2])           # strip non-alpha characters
    stem = last[3] + "_" + divtitle[:40]         # trim string > 40 characters
    pdfname = stem + ".pdf"
    tmptiff = join(destdir, stem + ".tif")
    pdfout = join(destdir, pdfname)
    btime = time()
    if verbosity:
        print("Appending to  %s" % pdfname)

    # concatenate tiff images, then convert the multi-image tiff to pdf
    status1, output1 = _runTool(["tiffcp"] + sources + [tmptiff])
    if status1 == 0:
        status2, output2 = _runTool(
            ["tiff2pdf", "-z", "-q", "50", "-o", pdfout, tmptiff])
    else:
        status2, output2 = status1, ""

    failed = status1 != 0 or status2 != 0
    if (output1 or output2) and (verbosity or failed):
        print("%s %s" % (output1, output2))
        print("")
    if failed:
        print("Warning! Could not create %s" % pdfout)

    # The temporary tiff is absent when tiffcp failed; do not crash on it.
    if exists(tmptiff):
        remove(tmptiff)
    print("Completed in  %s  seconds." % (time() - btime))
    print("")


def getPageList(divSeq, full_list):
    """Return (rows whose sequence number, column 3, equals divSeq; their count)."""
    pagelist = [element for element in full_list if element[3] == divSeq]
    return pagelist, len(pagelist)


def getDivSeq(full_list):
    """Return the sorted list of unique DivisionSequenceNumbers (column 3)."""
    return sorted(set(element[3] for element in full_list))


def readFile(textdir, textfile):
    """Parse a tab-delimited structure file.

    Returns ``(full_list, collection_list)``: full_list holds one 9-item
    list per data row (divType, divNumber, divTitle, divSeq, pageNum,
    pageSeq, filename, desc, partOf); collection_list holds the distinct
    collection prefixes of the file names in first-seen order.  Blank lines,
    the column-header row and rows whose file name does not contain "tif"
    are skipped.  A row with the wrong number of columns is reported and the
    program exits with status 1.
    """
    f = open(join(textdir, textfile), 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()

    full_list = []
    collection_list = []
    lcount = 0
    for line in lines:
        lcount += 1
        if not line.strip():
            continue  # blank line, typically at the end of the dump
        # Drop the line ending so the last column holds no stray CR/LF.
        linelist = line.rstrip('\r\n').split('\t')
        if len(linelist) != 9:
            print("Error! Wrong number of columns on line  %s" % lcount)
            print("Failing line was '%s'" % (line))
            exit(1)
        filename = linelist[6]
        if filename in ("pageFilename", "filename"):
            continue  # non-data header row
        # rfind() returns -1 (truthy) when absent, so test membership instead.
        if "tif" not in filename.lower():
            continue
        full_list.append(linelist)
        collection_name = _collectionName(filename)  # eg '2002u'
        if collection_name not in collection_list:
            collection_list.append(collection_name)
    return full_list, collection_list


def doMetadata(textdir, mdfile, imagedir, outdir, verbosity):
    """Write one ``dublin_core.xml`` per row of the metadata dump file.

    Each non-header row of ``textdir/mdfile`` must have 26 tab-separated
    columns.  Its non-empty values are written, one per line, to
    ``outdir/<first column>/dublin_core.xml``; the directory is created when
    missing.  Blank lines are skipped; a row with the wrong number of
    columns is reported and the program exits with status 1.  imagedir is
    unused and kept for interface compatibility.
    """
    f = open(join(textdir, mdfile), 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()

    lcount = 0
    for line in lines:
        lcount += 1
        if not line.strip():
            continue
        # Strip CR as well as LF: the dump may have Windows line endings.
        columns = line.rstrip('\r\n').split('\t')
        if len(columns) != 26:
            print("Error! Wrong number of columns on line  %s" % lcount)
            print("Failing line was '%s'" % (line))
            exit(1)
        (id1, id2, title, title_alt, title_alt2, creator, desc1, desc2, date,
         lcsh1, lcsh2, lcsh3, lcsh4, lcsh5, lcsh6, lcsh11, cov_temp, cov_spat,
         lang, fmt, typ, pub1, pub2, rel1, rel2, rights) = columns
        if id1 == "Identifier_Other":
            continue  # skip column header
        if verbosity:
            print("processing metadata for %s" % id1)

        # Output order differs from column order: titles come first.
        values = [title, title_alt, title_alt2, id1, id2, creator, desc1,
                  desc2, date, lcsh1, lcsh2, lcsh3, lcsh4, lcsh5, lcsh6,
                  lcsh11, cov_temp, cov_spat, lang, fmt, typ, pub1, pub2,
                  rel1, rel2, rights]
        # NOTE(review): the wrapper and per-field templates below carry no
        # XML markup although the file is named dublin_core.xml -- the
        # element tags look lost; confirm against the DSpace ingest format.
        parts = [" \n"]
        for value in values:
            if value != "":
                parts.append("\t%s \n" % (value))
        parts.append(" \n")

        dumpdir = join(outdir, id1)
        if not exists(dumpdir):
            mkdir(dumpdir)
        f2 = open(join(dumpdir, 'dublin_core.xml'), 'w')
        try:
            f2.write("".join(parts))
        finally:
            f2.close()


def summarize(btime, imgcount, pdfcount):
    """Print how many images became how many pdf's and the time since btime."""
    proctime = time() - btime
    if proctime > 60:
        print("%s images were converted into %s pdf's in %.2f minutes."
              % (imgcount, pdfcount, proctime / 60))
    else:
        print("%s images were converted into %s pdf's in %.2f seconds."
              % (imgcount, pdfcount, proctime))


if __name__ == "__main__":
    main()