# This script generates ePub files from the books made # available by the Neal A. Maxwell Institute for # Religious Scholarship at Brigham Young University # (http://maxwellinstitute.byu.edu/). # Run the script by passing the bookid from the book's url as an argument # e.g. for _Temple and the Cosmos_ # (http://maxwellinstitute.byu.edu/publications/books/?bookid=103), # use the command'python GetBook.py 103' # Script created by Matt Turner (http://guavaduck.com/) import urllib import re import os import zipfile import glob import shutil import sys import tidy import codecs import platform class URLopener(urllib.FancyURLopener): version = 'GetBook.py/0.1 '+platform.platform()+' (+http://guavaduck.com/)' urlopener=URLopener() print '--' print 'Maxwell Institute Book to ePub Converter' print 'by Matt Turner (http://guavaduck.com/)' print '--' print '' if len(sys.argv)>1: book_id=sys.argv[1] else: book_id=raw_input('Book id? ') book_url='http://maxwellinstitute.byu.edu/publications/books/?bookid='+book_id f=urlopener.open(book_url) #f=open('sample_book.html') s=f.read() #book_title=re.search('(?<=)',s).group(0) book_header=re.search('(?<=).+?(?=)',s).group(0) book_title=re.search('^.+(?= by )',book_header).group(0) book_author=re.search('(?<='+book_title+' by ).+$',book_header) #book_author=re.search('(?<=)',s) if book_author: book_author=book_author.group(0) else: book_author="Unknown" #book_author=re.search('(?<=)',s).group(0) #book_id=re.search('(?<=)',s).group(0) print 'Retrieving _'+book_title+'_ by '+book_author chapters=re.findall('(?<=\?bookid='+book_id+'&chapid=)[0-9]+(?=">)',s) n_chapters=len(chapters) chapter_titles=['']*n_chapters chapter_texts=['']*n_chapters tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0, char_encoding='utf8') #tidy_options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0) def chapterlink(matchobj): return 'chapter'+str(chapters.index(re.sub('http.+=','',matchobj.group(0))))+'.xhtml' for n in range(n_chapters): #for n in range(1): print 'Retrieving chapter '+str(n+1)+' of '+str(n_chapters) chapter_url=book_url+'&chapid='+chapters[n] f=urlopener.open(chapter_url) # f=open('sample_chapter.html') s_chapter=unicode(f.read(),'utf8','ignore') # chapter_titles[n]=re.search('(?<=)',s_chapter).group(0) chapter_titles[n]=re.search('(?<='+book_title+' - ).+?(?=)',s_chapter).group(0) chapter_text=re.findall("
[\s\S]*?(?=
)",s_chapter)[0] chapter_text=re.sub('http.+?chapid=([0-9]+)',chapterlink,chapter_text) chapter_texts[n]=re.sub('\n','\n\t\t',chapter_text) f.close() try: print ' '+chapter_titles[n] except: print ' '+chapter_titles[n].encode('ascii', 'replace') book_path=re.sub('[^a-zA-Z0-9\-_.() ]',' ',book_title)+'.'+book_id book_path0=book_path+'' n_path=0 while os.path.exists(book_path) or os.path.exists(book_path+'.epub'): n_path+=1 book_path=book_path0+'.'+str(n_path) os.mkdir(book_path) os.chdir(book_path) f=open('mimetype','w') f.write('application/epub+zip') f.close() os.mkdir('META-INF') os.chdir('META-INF') f=open('container.xml','w') f.write(''' ''') f.close() os.chdir('..') os.mkdir('OEBPS') os.chdir('OEBPS') f=codecs.open('content.opf','w','utf-8') f.write(''' '''+book_title+''' '''+book_author+''' en-US urn:uuid:'''+book_url+''' ''') for n in range(n_chapters): f.write(''' ''') f.write(''' ''') for n in range(n_chapters): f.write(''' ''') f.write(''' ''') f.close() f=codecs.open('toc.ncx', 'w', 'utf-8') f.write(''' '''+book_title+''' Title Page ''') for n in range(n_chapters): f.write(''' '''+chapter_titles[n]+''' ''') f.write(''' ''') f=codecs.open('title.xhtml','w','utf-8') f.write(str(tidy.parseString('\n\t\n\t\t'+book_title+'\n\t\n\t\n\t\t

'+book_title+'

\n\t\t

by '+book_author+'

\n\t\n', **tidy_options))) f.close() for n in range(n_chapters): f=open('chapter'+str(n)+'.xhtml','w') f.write(str(tidy.parseString(('\n\t\n\t\t'+chapter_titles[n]+'\n\t\n\t\n\t\t

'+chapter_titles[n]+'''

\n\t\t'''+chapter_texts[n]+'\n\t\n').encode('utf-8'), **tidy_options))) f.close() os.chdir('../..') file = zipfile.ZipFile(book_path+'.epub', "w") os.chdir(book_path) file.write('mimetype','mimetype',zipfile.ZIP_STORED) file.write('META-INF/container.xml','META-INF/container.xml',zipfile.ZIP_DEFLATED) #file.write('META-INF/container.xml','META-INF/container.xml',zipfile.ZIP_STORED) for name in glob.glob("OEBPS/*"): file.write(name,name,zipfile.ZIP_DEFLATED) # file.write(name,name,zipfile.ZIP_STORED) file.close() os.chdir('..') shutil.rmtree(book_path)