#!/usr/bin/env python3
# Copyright (C) 2015, 2018 Moritz Orbach
# Thanks to Tyler for correcting some small but fatal errors.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# 4chan image downloader v0.2
# Python 3
#
# birth: Fri 3 Apr 13:21:05 CEST 2015
# This is my first proper python program.
#
# https://github.com/4chan/4chan-API
# https://boards.4chan.org/hr/thread/2368597
# https://a.4cdn.org/hr/thread/2368597.json
#
# BUGS
# - saving the thread HTML currently does not work (403) and is deactivated
# - CFG_4saucefile is always written in UTF-8 (should be no problem)
#
# TODO
# - automatically download only files newer than the newest existing one (-> os.listdir)

import argparse
import re
import urllib.request
import urllib.error
import json
import sys
import time
import calendar
import signal
from os import path, utime, remove
from html import unescape
from urllib.parse import unquote

CFG_boardsrv = "boards.4chan.org"
CFG_imgsrv = "i.4cdn.org"
CFG_apisrv = "a.4cdn.org"
CFG_4saucefile = "hier.her"  # set this to None or an empty string to not save the thread URL in a file
CFG_statusfile = "4cdl.status"
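# How the status file works: after a run, the newest downloaded image's
# server-side filename base (e.g. 1426910440606) is written to CFG_statusfile;
# the next run in the same directory then skips everything older, unless
# -m/--minfile overrides the threshold or -i/--image requests specific files.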
##
## util
##

def warn(msg):
    print(msg, file=sys.stderr)

def signal_handler(signum, frame):
    print()
    sys.exit(0)


##
## parameters
##

# https://docs.python.org/3.4/library/argparse.html#argparse.Action
# would have been better: from urllib.parse import urlparse
# https://docs.python.org/3.3/library/urllib.parse.html#module-urllib.parse
class ParseURL(argparse.Action):
    # Instances of Action should have attributes "dest", "option_strings",
    # "default" etc. defined. The easiest way to ensure these attributes
    # are defined is to call Action.__init__.
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")  # XXX huh?
        super(ParseURL, self).__init__(option_strings, dest, **kwargs)

    # parser         ArgumentParser object
    # namespace      the Namespace object that will be returned by parse_args().
    #                Most actions add an attribute to this object using setattr().
    # values         the associated command-line arguments, with any type conversions applied
    # option_string  the option string that was used to invoke this action (optional)
    def __call__(self, parser, namespace, values, option_string=None):
        pattern = re.compile(r"^(https?)://" + CFG_boardsrv + r"/(.+?)/.+/(\d+)$")
        match = pattern.match(values)
        if match is None:
            raise ValueError("invalid URL: " + values)
        urlinfo = {}
        urlinfo["url"] = match.group(0)
        urlinfo["proto"] = match.group(1)
        urlinfo["board"] = match.group(2)
        urlinfo["thread"] = match.group(3)
        urlinfo["json"] = urlinfo["proto"] + "://" + CFG_apisrv + "/" + \
            urlinfo["board"] + "/thread/" + urlinfo["thread"] + ".json"
        setattr(namespace, self.dest, urlinfo)

class ParseSinglefile(argparse.Action):
    files = []  # class attribute, so repeated -i options accumulate here

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")  # XXX huh?
        super(ParseSinglefile, self).__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        pattern = re.compile(r"^(https?://.+?/.+/)?(\d+)(\..+)?$")
        match = pattern.match(values)
        if match is None:
            raise ValueError("invalid IMAGE: " + values)
        self.files.append(match.group(2))
        setattr(namespace, self.dest, self.files)

class ParseMinfile(argparse.Action):
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")  # XXX huh?
        super(ParseMinfile, self).__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        pattern = re.compile(r"(^https?://" + CFG_imgsrv + r"/.*?)?(\d+)(\..+)?$")
        match = pattern.match(values)
        if match is None:
            raise ValueError("not a valid 4chan filename")
        setattr(namespace, self.dest, match.group(2))

class ParseImgRegexp(argparse.Action):
    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")  # XXX huh?
        super(ParseImgRegexp, self).__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        try:
            pattern = re.compile(values)
        except Exception as e:
            raise ValueError("invalid regex: " + str(e))
        setattr(namespace, self.dest, pattern)


##
## meat
##

class WebFile():
    last_modified = None
    # https://docs.python.org/3.4/library/http.client.html#http.client.HTTPResponse
    response = None

    def __init__(self, url):
        #print("Download: ", url)
        self.get(url)

    def get(self, url):
        """HTTP-GET url"""
        # no try/except - just let URLError bubble upwards
        req = urllib.request.Request(url)
        self.response = urllib.request.urlopen(req)
        self.last_modified = self.response.getheader("Last-Modified")
        try:
            utctimestamp = time.strptime(self.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
            self.last_modified = calendar.timegm(utctimestamp)
        except TypeError:
            # no Last-Modified??
            warn(url + ": unable to get filestamp")
            self.last_modified = None

    def data(self):
        """Return the response body."""
        return self.response.read()

    def save(self, filename):
        """Save the response body to filename, preserving the server's mtime."""
        try:
            with open(filename, "wb") as imgfile:
                imgfile.write(self.response.read())
            if self.last_modified is not None:
                utime(filename, (self.last_modified, self.last_modified))
        except OSError as e:
            raise Exception(filename + ": " + e.strerror)
        except Exception as e:
            raise Exception(filename + ": " + str(e))
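# The thread-HTML download in main is deactivated because it 403's; the XXX
# comment there suspects the User-Agent. A hedged sketch of how get() could
# send a browser-like header to test that guess -- the UA string below is an
# arbitrary assumption, not a fix known to work:
#
#   req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
#   self.response = urllib.request.urlopen(req)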
class Download():
    skipped = 0
    downloaded = 0
    failed = 0
    alreadythere = 0
    lastpic = None

    def __init__(self, images=None, img_regex=None, minfile=None):
        if minfile is None:
            # not explicitly requested - try to load it from the status file
            try:
                with open(CFG_statusfile, "r") as statusfile:
                    contents = statusfile.readline().rstrip()
                try:
                    minfile = int(contents)
                except ValueError:
                    warn("invalid " + CFG_statusfile + " - downloading everything")
                    minfile = 0
            except FileNotFoundError:
                # Normal. Just load everything
                minfile = 0
            except OSError as e:
                raise Exception(CFG_statusfile + ": " + e.strerror)

        for post in j["posts"]:
            if "filename" not in post:
                # no picture attached
                continue
            chan_filenamebase = str(post["tim"])
            chan_filename = chan_filenamebase + post["ext"]
            uploader_filename = post["filename"] + post["ext"]
            # 1426910440606 - Real-Nightmare-ShinheadMothafucka.jpg
            local_filename = chan_filenamebase + " - " + uploader_filename
            local_filename = unescape(local_filename)  # HTML entities (&amp; etc.)
            local_filename = unquote(local_filename)
            # https://i.4cdn.org/hr/1426910440606.jpg
            imgurl = args.urlinfo["proto"] + "://" + CFG_imgsrv + "/" + args.urlinfo["board"] + "/" + chan_filename
            #print(chan_filenamebase, ", ", minfile, ", ", uploader_filename)  # debug
            #if path.exists(local_filename):  # debug
            if not path.exists(local_filename):
                if images is None and int(chan_filenamebase) <= int(minfile):
                    # user already got this
                    self.skipped += 1
                    continue
                if images is not None and chan_filenamebase not in images:
                    # user doesn't want this
                    self.skipped += 1
                    continue
                if img_regex is not None and not img_regex.search(uploader_filename):
                    # user doesn't want this
                    self.skipped += 1
                    continue
                print(imgurl + " -> " + local_filename)
                try:
                    f = WebFile(imgurl)
                    f.save(local_filename)
                    self.downloaded += 1
                except urllib.error.URLError:
                    self.failed += 1
                    raise
                except OSError as e:
                    raise Exception(local_filename + ": " + e.strerror)
                #except Exception as e:
                #    # The web calls it an "unexpected" error.
                #    # Try to keep the user from seeing an ugly backtrace -> index doesn't work reliably
                #    raise Exception(local_filename + str(e.args[0]))
            else:
                self.skipped += 1
                self.alreadythere += 1
            self.lastpic = chan_filenamebase
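# For reference, the fields Download() reads from each post object in the
# thread JSON (see the 4chan-API link in the header; "tim" appears to be the
# upload timestamp in milliseconds). The values are the sample ones from the
# comments above:
#
#   {
#       "tim": 1426910440606,                             # server-side filename base
#       "ext": ".jpg",                                    # extension, including the dot
#       "filename": "Real-Nightmare-ShinheadMothafucka"   # uploader's filename, no extension
#   }
#
# Posts without a "filename" key carry no image and are skipped.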
##
## misc
##

def saveline(filename, threadurl):
    threadurl += "\n"
    try:
        with open(filename, "wb") as outfile:
            outfile.write(threadurl.encode('utf-8'))  # FIXME locale??
    except OSError as e:
        raise Exception(filename + ": " + e.strerror)


#
## main
#

signal.signal(signal.SIGINT, signal_handler)

parser = argparse.ArgumentParser(description="Grabs images from 4chan threads, keeping both the new and the original filenames.")
parser.add_argument("urlinfo", metavar="THREADURL", type=str, action=ParseURL,
                    help="URL of a 4chan thread")
parser.add_argument("-s", "--source", action="store_true",
                    help="save the raw HTML of the thread and its URL in two additional files. This is the default.")
parser.add_argument("-m", "--minfile", type=str, action=ParseMinfile,
                    help="load only images newer than MINFILE. MINFILE can be an image URL or the filename.")
parser.add_argument("-i", "--image", type=str, action=ParseSinglefile,
                    help="download a specific IMAGE from THREADURL. Can be given multiple times. This option disables the implicit --source. IMAGE can be an image URL or the filename.")
parser.add_argument("-I", "--img-regex", type=str, action=ParseImgRegexp,
                    help="download all images whose uploader filename matches IMG_REGEX")
try:
    args = parser.parse_args()
except ValueError as e:
    warn(str(e))
    sys.exit(1)

# Web
try:
    f = WebFile(args.urlinfo["json"])
except urllib.error.HTTPError as e:
    if e.code == 404:
        warn("404'd")
        if path.exists(CFG_statusfile):
            remove(CFG_statusfile)
    sys.exit(1)
except urllib.error.URLError as e:
    warn("unable to load URL: " + str(e.reason))
    sys.exit(1)
except Exception:
    warn("failed to download 4chan's JSON: " + str(sys.exc_info()[1]))
    sys.exit(1)

# JSON
try:
    j = json.loads(f.data().decode("utf-8"))
except ValueError:
    warn("failed to parse 4chan's JSON")
    sys.exit(1)

# Download
#try:
#    d = Download(args.image, args.img_regex, args.minfile)
#except Exception as e:
#    print("download failed:", str(e))
#    sys.exit(1)
d = Download(args.image, args.img_regex, args.minfile)

if d.lastpic is not None:
    try:
        saveline(CFG_statusfile, d.lastpic)
    except Exception as e:
        warn("unable to save status file: " + str(e))
        sys.exit(1)

if args.image is None or args.source:
    # XXX 403's. Wrong user agent?
    #XXX-# hr_2374787.html
    #XXX-try:
    #XXX-    f = WebFile(args.urlinfo["url"])
    #XXX-    f.save(args.urlinfo["board"] + "_" + args.urlinfo["thread"] + ".html")
    #XXX-except Exception as e:
    #XXX-    warn("unable to save thread: " + str(e))
    #if path.exists(CFG_4saucefile):  # debug
    if CFG_4saucefile and not path.exists(CFG_4saucefile):
        # save sauce
        try:
            saveline(CFG_4saucefile, args.urlinfo["url"])
        except Exception as e:
            warn("unable to save source url: " + str(e))

# summary
print(str(d.skipped) + " skipped (" + str(d.alreadythere) + " already downloaded)")
print(str(d.downloaded) + " downloaded")
if d.failed > 0:
    print(str(d.failed) + " failed")
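# Example invocations, assuming the script is saved as 4cdl.py (the name is a
# guess derived from CFG_statusfile); the thread URL is the sample one from
# the header comment:
#
#   ./4cdl.py https://boards.4chan.org/hr/thread/2368597
#   ./4cdl.py -m 1426910440606 https://boards.4chan.org/hr/thread/2368597
#   ./4cdl.py -i https://i.4cdn.org/hr/1426910440606.jpg https://boards.4chan.org/hr/thread/2368597
#   ./4cdl.py -I '(?i)wallpaper' https://boards.4chan.org/hr/thread/2368597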