// TumblrImageDownloader.js

'use strict';

const request = require('request-promise-any');
const cheerio = require('cheerio');
const ProxyAgent = require('proxy-agent');
const _ = require('lodash');
const EventEmitter = require('eventemitter3');

/**
 * Default user agent sent with XHR requests.
 * It identifies as mobile Safari on an iPhone so that Tumblr answers with
 * its mobile-formatted page.
 * 
 * @constant
 * @type {string}
 * @default 
 */
const TUMBLR_MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1";

/**
 * Default user agent sent with non-XHR requests.
 * Its value is the very same string as {@link TUMBLR_MOBILE_USER_AGENT}.
 * 
 * @constant
 * @type {string}
 * @default 
 */
const TUMBLR_USER_AGENT = TUMBLR_MOBILE_USER_AGENT;

/**
 * Template of the form that is POSTed to log in.
 * 
 * The three `null` fields are placeholders for the credentials; every other
 * field carries a fixed value. The object is frozen, so it must be copied
 * before it is filled in.
 * 
 * @constant
 * @type {object}
 * @default
 */
const TUMBLR_LOGIN_FORM = Object.freeze({
	// Credential placeholders.
	"determine_email": null,
	"user[email]": null,
	"user[password]": null,

	// Fixed fields.
	"tumblelog[name]": "",
	"user[age]": "",
	"context": "home_signup",
	"version": "STANDARD",
	"follow": "",
	"http_referer": "https://www.tumblr.com/",
	"seen_suggestion": "0",
	"used_suggestion": "0",
	"used_auto_suggestion": "0",
	"about_tumblr_slide": "",
	"random_username_suggestions": '[""]'
});

/**
 * Response transform: parses a response body with cheerio.
 * 
 * @param {string} body - Body of the response.
 * @returns {any} - Whatever `cheerio.load` returns for that body (the `$` object).
 * @private
 */
function transform_cheerio (body) {
	const $ = cheerio.load(body);
	return $;
}

/**
 * Scrapes photo posts from a Tumblr blog and can optionally download the image bytes.
 * 
 * Every HTTP call goes through {@link TumblrImageDownloader#request}, so the cookie jar,
 * proxy agent and default headers configured in the constructor apply to all of them.
 * 
 * @extends {EventEmitter}
 */
class TumblrImageDownloader extends EventEmitter {
	/**
	 * Options that can be passed to the constructor
	 * @typedef TumblrImageDownloaderOptions
	 * 
	 * @property {CookieJar} [cookie_jar] - A {@link https://bit.ly/2Oq89f0|tough-cookie} compatible cookie jar. The CookieJar object must be created with `looseMode` set to `true`. A tough-cookie `Store` is accepted as well and is handed to `request.jar()`.
	 * @property {string} [user_agent=TUMBLR_USER_AGENT] - The user-agent that will be used for non-XHR requests.
	 * @property {string} [mobile_user_agent=TUMBLR_MOBILE_USER_AGENT] - The user-agent that will be used for XHR requests.
	 * @property {string} [proxy_url] - URL to a proxy (SOCKS, HTTP or PAC) that will be used with each request. Will be passed to {@link https://bit.ly/2Qz8vSj|proxy-agent}
	 */

	/**
	 * Creates a `TumblrImageDownloader` object.
	 * @param {TumblrImageDownloaderOptions} [options={}] - Options that can be passed to the constructor. All are optional, and so is the object itself.
	 */
	constructor(options = {}) {
		super();

		// The default parameter covers `undefined`; `|| {}` additionally tolerates `null`.
		const { cookie_jar, user_agent, proxy_url, mobile_user_agent } = options || {};

		// `request.jar(store)` interprets its argument as a cookie *store* and builds its
		// own CookieJar around it. A ready-made CookieJar therefore has to be installed the
		// same way the `cookies` setter does it, not passed through as if it were a store.
		const is_cookie_jar = Boolean(cookie_jar) && typeof cookie_jar.getCookieString === 'function';

		/**
		 * The request `jar` object that will be used with each request.
		 * Is a wrapper for {@link TumblrImageDownloader#cookies} so that `TumblrImageDownloader.cookies == TumblrImageDownloader.jar._jar`.
		 * @type {RequestJar}
		 * @public
		 */
		this.jar = is_cookie_jar ? request.jar() : request.jar(cookie_jar);
		if (is_cookie_jar) {
			this.jar._jar = cookie_jar;
		}

		/**
		 * The agent that will be used with each request (`undefined` when no proxy is configured).
		 * @public
		 */
		this.agent = proxy_url ? new ProxyAgent(proxy_url) : undefined;

		/**
		 * The user-agent that will be used with each non-XHR request.
		 * @public
		 * @type {string}
		 * @default TUMBLR_USER_AGENT;
		 */
		this.user_agent = user_agent || TUMBLR_USER_AGENT;

		/**
		 * The user-agent that will be used with XHR requests.
		 * @public
		 * @type {string}
		 * @default TUMBLR_MOBILE_USER_AGENT;
		 */
		this.mobile_user_agent = mobile_user_agent || TUMBLR_MOBILE_USER_AGENT;

		/**
		 * The headers that will be sent with all non-xhr requests.
		 * @public
		 * @type {Object}
		 */
		this.headers = {
			'User-Agent': this.user_agent
		};

		/**
		 * {@link https://bit.ly/2guFWYe|request-promise-any} object that will be used for each request.
		 * @public
		 * @type {Object}
		 */
		this.request = request.defaults({
			jar: this.jar,
			agent: this.agent,
			headers: this.headers
		});

		/**
		 * The login form that will be posted when {@link TumblrImageDownloader#login} is called, excluding the credentials and CSRF.
		 * Is a private deep copy of `TUMBLR_LOGIN_FORM`, so it can be edited per instance.
		 * @public
		 * @type {Object}
		 */
		this.login_form_template = _.cloneDeep(TUMBLR_LOGIN_FORM);
	}

	/**
	 * The cookies that will be sent with each request.
	 * Reads and writes the `_jar` property of {@link TumblrImageDownloader#jar}.
	 * 
	 * @param {CookieJar} value - A {@link https://bit.ly/2Oq89f0|tough-cookie} compatible `CookieJar` object.
	 * @returns {CookieJar} - A {@link https://bit.ly/2Oq89f0|tough-cookie} compatible `CookieJar` object.
	 */
	get cookies() {
		return this.jar._jar;
	}

	set cookies(value) {
		this.jar._jar = value;
	}

	/**
	 * Returns the headers that will be used during XHR requests: a copy of
	 * {@link TumblrImageDownloader#headers} with the mobile user-agent and the
	 * `X-Requested-With` marker applied on top.
	 * 
	 * @returns {Object} - Object containing headers
	 * @private
	 */
	get xhr_headers() {
		return _.extend(_.clone(this.headers), {
			'User-Agent': this.mobile_user_agent,
			'X-Requested-With': 'XMLHttpRequest'
		});
	}

	/**
	 * Retrieves the Tumblr login page and extracts the CSRF token from its
	 * `tumblr-form-key` meta tag.
	 * Returns a copy of {@link TumblrImageDownloader#login_form_template} with the CSRF token set as `form_key`.
	 * 
	 * @returns {Promise<Object>} - The Tumblr login form.
	 * @async
	 */
	async getLoginForm() {
		const $ = await this.request({
			url: `https://www.tumblr.com/login`,
			transform: transform_cheerio
		});

		const form_key = $('meta[name="tumblr-form-key"]').attr('content');

		const form = _.cloneDeep(this.login_form_template);
		form.form_key = form_key;

		return form;
	}

	/**
	 * Posts the login form.
	 * 
	 * @param {Object} form - The Tumblr login form.
	 * @returns {Promise<void>}
	 * @throws {Error} When the response contains an element matching `#signup_forms .error`; the message is that element's text.
	 * @async
	 */
	async postLoginForm(form) {
		const $ = await this.request({
			url: 'https://www.tumblr.com/login',
			form,
			method: 'POST',
			followAllRedirects: true,
			transform: transform_cheerio
		});

		const error_box = $('#signup_forms .error');

		if (error_box.length) {
			throw new Error(error_box.text());
		}
	}

	/**
	 * @typedef {Object} TumblrLoginResponse
	 * @property {boolean} [already_logged_in] - Indicates if a session already exists for this account.  
	 */

	/**
	 * Login to the Tumblr account using the provided credentials.
	 * 
	 * The dashboard is requested first; the login form is only fetched and posted
	 * when that response contains the `#signup_forms` element.
	 * 
	 * @param {string} username - The username to use.
	 * @param {string} password - The password to use.
	 * @returns {Promise<TumblrLoginResponse>}
	 * @async
	 */
	async login(username, password) {
		const $ = await this.request({
			url: 'https://www.tumblr.com/dashboard',
			// The option is spelled `followRedirect`; the plural form is not a request option.
			followRedirect: true,
			followAllRedirects: true,
			transform: transform_cheerio
		});

		if (!$('#signup_forms').length) {
			return { already_logged_in: true };
		}

		const form = await this.getLoginForm();
		_.extend(form, {
			determine_email: username,
			'user[email]': username,
			'user[password]': password
		});

		await this.postLoginForm(form);

		return { already_logged_in: false };
	}

	/**
	 * Downloads an individual photo from a Tumblr blog.
	 * The request is made with `encoding: null` and `resolveWithFullResponse: true`.
	 * 
	 * @param {string} url - URL of the photo to download.
	 * @returns {Promise<ClientResponse>} - HTTP Response.
	 * @async
	 */
	async downloadPhoto(url) {
		return await this.request({
			url,
			encoding: null,
			resolveWithFullResponse: true
		});
	}

	/**
	 * Photo in a photoset.
	 * 
	 * @typedef {Object} PhotosetPhoto
	 * @property {string} photoId - ID of the photo
	 * @property {string} photoUrl - URL of the photo
	 */

	/**
	 * Returns the photos in a photoset.
	 * 
	 * @param {string} url - URL of the photoset.
	 * @returns {Promise<PhotosetPhoto[]>} - The photos in the photoset.
	 * @async
	 */
	async getPhotoset(url) {
		// Must go through `this.request` (not the bare module) so the cookie jar,
		// proxy agent and default headers of this instance are applied.
		const $ = await this.request({
			url,
			headers: this.xhr_headers,
			transform: transform_cheerio
		});

		return $('a.photoset_photo').get().map((photoset_photo) => {
			// Keeps whatever follows the `photoset_link_` prefix of the anchor's id.
			const photoId = $(photoset_photo).attr('id').split('photoset_link_').pop();
			const photoUrl = $('img', photoset_photo).attr('src');

			return { photoId, photoUrl };
		});
	}

	/**
	 * Represents data on a individual photo.
	 * 
	 * @typedef {Object} Photo 
	 * @property {string} photoId - Unique ID of the photo.
	 * @property {string} photoUrl - URL of the photo.
	 * @property {string[]} tags - Tags that belong to the photo.
	 * @property {string} author - Original author of the photo.
	 * @property {Buffer} [photoBytes] - The actual downloaded photo. 
	 */

	/**
	 * Retrieves all photos on a page of a blog.
	 * 
	 * `article.photo` posts yield one entry each. `article.photoset` posts are
	 * expanded through {@link TumblrImageDownloader#getPhotoset} into one entry per
	 * photo, each carrying the tags and author of the post.
	 * 
	 * @param {string} blogSubdomain - Subdomain of the blog.
	 * @param {number} [pageNumber=1] - Page number of the blog.
	 * @returns {Promise<Photo[]>}
	 * @async
	 */
	async getPhotos(blogSubdomain, pageNumber) {
		const page = pageNumber || 1;
		const $ = await this.request({
			url: `https://${blogSubdomain}.tumblr.com/page/${page}`,
			headers: this.xhr_headers,
			transform: transform_cheerio
		});

		const posts = $('article.photo, article.photoset').get();

		const process_posts = posts.map(async (post) => {
			const photoId = $(post).attr('data-post-id');
			const tags = $('.tag-link', post).get().map((element) => $(element).text());
			// A reblog link names the original author; otherwise the blog itself is the author.
			const reblog_link = $('.reblog-link', post);
			const author = reblog_link.length ? reblog_link.attr('data-blog-card-username') : blogSubdomain;

			if (!$(post).is('article.photoset')) {
				const photoUrl = $('img', post).attr('src');
				return { photoId, photoUrl, tags, author };
			}

			// The iframe `src` is appended to the blog origin as-is.
			const photoset_url = `https://${blogSubdomain}.tumblr.com` + $('iframe.photoset', post).attr('src');
			const photoset_photos = await this.getPhotoset(photoset_url);
			return photoset_photos.map((photoset_photo) => _.extend(photoset_photo, { tags, author }));
		});

		const result = await Promise.all(process_posts);
		// Photoset posts produce arrays, so flatten one level.
		return _.flatten(result);
	}

	/**
	 * Options that can be used with {@link TumblrImageDownloader#scrapeBlog}.
	 * @typedef ScrapeBlogOptions
	 * @property {number} [pageNumber=1] - Page number to start at.
	 * @property {string} blogSubdomain - Subdomain of the blog to scrape from.
	 * @property {boolean} [downloadPhotos=false] - Download the photos rather than just grabbing the URLs.
	 * @property {boolean} [returnPhotos=false] - Returns all of the photos as an array.
	 * @property {number} [stopAtIndex] - Stop once the zero-based page index (the number of page advances so far) has reached this value, i.e. a value of N scrapes N + 1 pages. Falsy values disable the limit.
	 * @property {number} [stopAtPage] - Stop after this page of the blog has been scraped. Falsy values disable the limit.
	 * @property {Function} [predownloadFilter] - A function that will be used to filter {@link Photo|Photos} before downloading them. Only applied when `downloadPhotos` is set.
	 */

	/**
	 * Iterates through all pages in a blog, until a page without photos or one of
	 * the stop options is reached.
	 * By default photos are emitted via the {@link TumblrImageDownloader#photo} event and not resolved with the Promise.
	 * Set `options.returnPhotos` to `true` to return photos.
	 * 
	 * The `options` object is only read, never modified. A failure emits a single
	 * `error` event and rejects the returned Promise with the same error.
	 * 
	 * @example
	 * let downloader = new TumblrImageDownloader();
	 * downloader.on('photo', () => { 'do something with photo' });
	 * downloader.scrapeBlog({ blogSubdomain: 'blah' });
	 * 
	 * @param {ScrapeBlogOptions} options - Options that can be used with this method.
	 * @returns {Promise|Promise<Photo[]>}
	 * @throws {Error} When `options.blogSubdomain` is missing, or when fetching/downloading fails.
	 * @async
	 */
	async scrapeBlog(options) {
		if (!options || !options.blogSubdomain)
			throw new Error("Blog subdomain not provided");

		const { blogSubdomain, downloadPhotos, returnPhotos, predownloadFilter, stopAtIndex, stopAtPage } = options;

		// Paging state lives in locals so the caller's object stays untouched.
		// `options.index` / `options.photos` are still honoured as starting values.
		let pageNumber = options.pageNumber || 1;
		let index = options.index || 0;
		let collected = returnPhotos ? (options.photos || []).slice() : undefined;

		try {
			for (;;) {
				let photos = await this.getPhotos(blogSubdomain, pageNumber);
				// Counted before filtering: a page only ends the scrape when it has no photos at all.
				const photo_count = photos.length;

				if (downloadPhotos) {
					if (predownloadFilter) {
						photos = photos.filter(predownloadFilter);
					}

					photos = await Promise.all(photos.map(async (photo_info) => {
						const photo_resp = await this.downloadPhoto(photo_info.photoUrl);
						photo_info.photoBytes = photo_resp.body;
						return photo_info;
					}));
				}

				for (const photo of photos) {
					/**
					 * Fires when a photo has been scraped.
					 * @event TumblrImageDownloader#photo
					 * @type {Photo}
					 * 
					 */
					this.emit('photo', photo);
				}

				if (returnPhotos) {
					collected = collected.concat(photos);
				}

				const stop_requested = (stopAtIndex && index >= stopAtIndex) || (stopAtPage && pageNumber >= stopAtPage);
				if (!photo_count || stop_requested) {
					break;
				}

				pageNumber++;
				index++;

				/**
				 * Represents an object that will be sent with {@link TumblrImageDownloader#pageChange}
				 * @typedef PageChangeEvent
				 * @property {string} blogSubdomain - Subdomain of the blog being scraped.
				 * @property {number} pageNumber - Page the scraper is currently on in the blog.
				 * @property {number} index - How many pages have been scraped so far.
				 */

				/**
				 * Fires when the scraper has moved on to the next page.
				 * @event TumblrImageDownloader#pageChange
				 * @type {PageChangeEvent}
				 * 
				 */
				this.emit('pageChange', { blogSubdomain, pageNumber, index });
			}

			/**
			 * Fires when the scraper has scraped all pages.
			 * @event TumblrImageDownloader#end
			 * 
			 */
			this.emit('end');

			// `undefined` unless `returnPhotos` was requested.
			return collected;
		} catch (error) {
			/**
			 * Fires if an error occurs during scraping.
			 * @event TumblrImageDownloader#error
			 * @type {Error}
			 * 
			 */
			this.emit('error', error);
			throw error;
		}
	}

}

/**
 * Module that contains the {@link TumblrImageDownloader} class.
 * The class itself is the module's export (assigned to `module.exports`).
 * @module tumblr-image-downloader/TumblrImageDownloader
 * @see TumblrImageDownloader
 */
module.exports = TumblrImageDownloader;