UnixTools/packtdl.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Needs grab, pip3 install -U grab
# (also needs: libcurl4-*-dev for curl-config, libxslt1-dev and libxml2-dev)

import configparser
import logging
import os.path
import re
import selection
import sys
import unicodedata
from argparse import ArgumentParser
from grab import Grab
from io import StringIO

# The CONFIG file sitting next to this script is parsed with a synthetic
# [DEFAULT] header prepended -- presumably because the file itself carries no
# section header, which configparser would otherwise reject.
# The file is opened in a `with` block so the handle is closed right away.
with open(os.path.join(sys.path[0], 'CONFIG'), 'r') as config_file:
    config_str = '[DEFAULT]\n' + config_file.read()
config_fp = StringIO(config_str)

c = configparser.RawConfigParser()
# read_file() is the supported replacement for readfp(), which was deprecated
# in Python 3.2 and removed in Python 3.12.
c.read_file(config_fp)

logging.basicConfig(level=logging.DEBUG)

class LoggedOutException(Exception):
    '''Raised when an operation that needs a logged-in session is attempted while logged out.'''

class PacktBook:
    '''One purchased book together with the download links found for it.

    Every attribute is a plain string and starts out empty. An empty
    download link means no link for that format was found.
    '''

    # Prefix put in front of the site-relative links read from the page.
    BASE_URL = "https://www.packtpub.com"

    def __init__(self):
        self.title = ""      # title, with a trailing " [eBook]" marker removed
        self.url = ""        # absolute URL of the book's page
        self.isbn = ""       # ISBN, if the entry carries one
        self.nid = ""        # value of the entry's "nid" attribute
        self.cover_img = ""  # cover image URL without the thumbnail path part
        self.dl_pdf = ""     # absolute download URLs, "" when not available
        self.dl_epub = ""
        self.dl_mobi = ""
        self.dl_code = ""

    def __str__(self):
        '''Return a one-line summary: availability flags, title and ISBN.

        The flags are P, e, K and Z for dl_pdf, dl_epub, dl_mobi and dl_code;
        a "-" takes the place of every link that is empty.
        '''
        flags = "".join(
            flag if link else "-"
            for flag, link in (
                ("P", self.dl_pdf),
                ("e", self.dl_epub),
                ("K", self.dl_mobi),
                ("Z", self.dl_code),
            )
        )
        return "[{}] {} ({})".format(flags, self.title, self.isbn)

    def get_safe_name(self):
        '''Returns the name of the book safe for using for file names.

        Non-ASCII characters are decomposed and reduced to their ASCII part,
        anything that is not a word character, whitespace or "-" is dropped,
        and runs of whitespace/hyphens are collapsed into one underscore.
        '''
        name = unicodedata.normalize('NFKD', self.title)
        name = name.encode('ascii', 'ignore').decode('ascii')
        # Raw strings: "\w" and "\s" are invalid escapes in a normal literal.
        name = re.sub(r'[^\w\s-]', '', name).strip()
        return re.sub(r'[-\s]+', '_', name)

    def _link(self, book, xpath):
        '''Return BASE_URL plus the text selected by `xpath`, or "" if nothing matched.'''
        found = book.select(xpath)
        if not found:
            return ""
        return self.BASE_URL + found.text()

    def parse_from_xsel(self, book: "selection.backend.XpathSelector"):
        '''Fill this object from the DOM section in `book`.

        Title, page URL, nid and cover image are read unconditionally. The
        ISBN and all four download links are optional and are left empty when
        the corresponding node is missing.
        '''
        suffix = " [ebook]"
        self.title = book.select("@title").text()
        if self.title[-len(suffix):].lower() == suffix:
            self.title = self.title[:-len(suffix)]
        self.url = self.BASE_URL + book.select(".//div[@class='product-top-line']/div/a/@href").text()
        self.nid = book.select("@nid").text()
        self.cover_img = book.select(".//img/@src").text().replace("imagecache/thumbview/", "")

        isbn = book.select(".//div/@isbn")
        if isbn:
            self.isbn = isbn.text()

        # The PDF link is guarded like the other formats; __str__ already
        # treats it as optional.
        self.dl_pdf = self._link(book, ".//a[div/@format='pdf']/@href")
        self.dl_epub = self._link(book, ".//a[div/@format='epub']/@href")
        self.dl_mobi = self._link(book, ".//a[div/@format='mobi']/@href")
        self.dl_code = self._link(book, ".//a[starts-with(@href, '/code_download')]/@href")


class PacktPub:
    '''Session against www.packtpub.com, driven through a Grab instance.'''

    def __init__(self):
        self.g = Grab()
        self.g.setup(
            follow_location=True,
            follow_refresh=True,
            timeout=120,
            connect_timeout=10,
        )
        #self.g.setup(body_maxsize=512000)
        self.logged_in = False

    def login(self, email, password):
        '''Log in through the login form on the home page.

        Copies of the home page and of the response to the form submission
        are saved under /tmp. `logged_in` is only set once the response has
        passed the check for the '"sid":' marker.
        '''
        self.g.go('https://www.packtpub.com/')
        self.g.doc.save('/tmp/packtpub-home.html')
        self.g.doc.choose_form(id='packt-user-login-form')
        print("Logging in with account: {}".format(email))
        self.g.doc.set_input('email', email)
        self.g.doc.set_input('password', password)
        self.g.doc.submit()
        self.g.doc.save('/tmp/packpub-home-after-login.html')
        # Presumably raises when the marker is absent, i.e. the login failed.
        self.g.doc.text_assert('"sid":')
        self.logged_in = True

    def get_ebooks_list(self, url="https://www.packtpub.com/account/my-ebooks"):
        '''Load the list of purchased ebooks and return it as a list of PacktBook objects.

        The loaded page is saved to /tmp/packtpub-my-ebooks.html. URLs that
        do not start with "http" (e.g. a file:// copy of that page) may be
        loaded without being logged in.

        Raises LoggedOutException for an http(s) URL when not logged in.
        '''
        if url.startswith("http") and not self.logged_in:
            raise LoggedOutException("Must be logged in before getting ebooks list!")
        self.g.go(url)
        self.g.doc.save('/tmp/packtpub-my-ebooks.html')
        self.g.doc.text_assert('<h1>My eBooks </h1>')
        all_books_xsel = self.g.doc.select('//div[@id="product-account-list"]/div[starts-with(@class, "product-line")][@title]')
        all_books = []
        for entry in all_books_xsel:
            book_obj = PacktBook()
            book_obj.parse_from_xsel(entry)
            all_books.append(book_obj)
        return all_books

    def download_book_all(self, book: "PacktBook", destination_directory):
        '''Download every available file of `book` into destination_directory.

        Files are named after book.get_safe_name() plus an extension per
        format. Formats whose link is empty are skipped. The directory is
        created if it does not exist yet.

        Raises LoggedOutException when not logged in.
        '''
        if not self.logged_in:
            raise LoggedOutException("Must be logged in before download!")
        base_name = book.get_safe_name()
        # exist_ok makes a separate existence check unnecessary (and race-free).
        os.makedirs(destination_directory, mode=0o775, exist_ok=True)
        downloads = (
            ("PDF", book.dl_pdf, ".pdf"),
            ("EPUB", book.dl_epub, ".epub"),
            ("MOBI", book.dl_mobi, ".mobi"),
            # NOTE(review): the code download is assumed to be a zip archive
            # (PacktBook.__str__ flags it as "Z") -- confirm.
            ("code", book.dl_code, ".zip"),
        )
        for label, link, extension in downloads:
            if not link:
                continue
            print("Downloading {} of {} from {}".format(label, base_name, link))
            self.g.download(link, os.path.join(destination_directory, base_name + extension))

# Command line: an optional 1-based range of books. Without any range option
# the purchased books are only listed, not downloaded.
parser = ArgumentParser(description="List or download all purchased ebooks from your PACKT account.")
parser.add_argument(
    "--start",
    dest="idx_start",
    metavar="NUMBER",
    type=int,
    default=1,
    required=False,
    help="Index to start at (default: 1)",
)
parser.add_argument(
    "-n", "--count",
    dest="count",
    metavar="COUNT",
    type=int,
    required=False,
    help="Number of items to download, starting at --start index",
)
parser.add_argument(
    "--end",
    dest="idx_end",
    metavar="NUMBER",
    type=int,
    required=False,
    help="Index to stop at (default: last)",
)
#parser.add_argument("--verbose", help="Verbose logging to STDERR", action="store_true")
opts = vars(parser.parse_args())

# Log in, then read the book list -- from the saved copy of the page if one
# is present in /tmp, otherwise from the site.
p = PacktPub()
p.login(c.get('DEFAULT', 'PACKT_LOGIN'), c.get('DEFAULT', 'PACKT_PASSWORD'))
if not os.path.isfile("/tmp/packtpub-my-ebooks.html"):
    all_books = p.get_ebooks_list()
else:
    all_books = p.get_ebooks_list("file:///tmp/packtpub-my-ebooks.html")

total = len(all_books)
print("Found {:d} ebooks.".format(total))

range_requested = opts["idx_start"] > 1 or opts["idx_end"] or opts["count"]
if not range_requested:
    # No selection made: Show list
    for number, book in enumerate(all_books, start=1):
        print("{:d}: {}".format(number, book))
else:
    # Some range given: Download books
    idx_start = max(opts["idx_start"], 1)

    # --end wins over --count; with neither, run to the last book.
    if opts["idx_end"]:
        idx_end = opts["idx_end"]
    elif opts["count"]:
        idx_end = idx_start + opts["count"] - 1
    else:
        idx_end = total

    # Clamp: not before the start index, and not past the last book.
    idx_end = min(max(idx_end, idx_start), total)

    print("Selected range: {:d} to {:d}".format(idx_start, idx_end))

    for number, book in enumerate(all_books[idx_start - 1:idx_end], start=idx_start):
        print("{:d}: {}".format(number, book))
        p.download_book_all(book, "/tmp/packt")