#!/usr/bin/ruby -w
# Copyright (C) 2017 Moritz Orbach
# https://apfelboymchen.net/gnu/4chan/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# 4chan image downloader
# Ruby
#
# birth: Tue 21 Mar 16:08:32 CET 2017
# This is my first ruby program.
#
# https://github.com/4chan/4chan-API
#
# v0.9.1 2017-04-04
#

require 'json'
require 'net/http'
require 'cgi'
require 'open-uri'
require 'optparse'

module FourChan
  # Kept so that everything that mixes in FourChan keeps the ancestors it
  # had before.
  include Comparable

  # Fetch the document behind +apiurl+ and return its body as a String.
  #
  # Callable as FourChan.download (e.g. from the main script) and, through the
  # instance method below, from classes that include FourChan.
  #
  # The URL is parsed first and read through open-uri's URI#read instead of
  # Kernel#open: Kernel#open no longer opens URLs on Ruby 3 and would execute
  # strings starting with "|" as a command.
  #
  # Raises RuntimeError "404'd" when the server answers 404 and
  # RuntimeError "download failed: ..." for every other failure.
  def self.download(apiurl)
    begin
      uri = URI.parse(apiurl.to_s.strip)
      raise "unsupported URL: #{apiurl}" unless uri.respond_to?(:read)
      uri.read
    rescue => e
      # checking the string is ugly, but this seems to be the only way in
      # open-uri
      if e.message =~ /^404 /
        raise "404'd"
      else
        raise "download failed: " + e.message
      end
    end
  end

  # Instance-level entry point for classes that include FourChan.
  # See FourChan.download.
  def download(apiurl)
    FourChan.download(apiurl)
  end

  #
  ## 4chan post
  #
  # Wraps the attribute hash of a single post as delivered by the JSON API
  # and knows how to download the attached image, if there is one.
  # Posts are ordered by their post number.
  class FCPost
    include Comparable # <=> is defined below

    attr_reader :values, :postnumber
    attr_reader :imgbasename, :imgfilename, :oimgfilename, :savefilename
    attr_reader :httpstatus, :img_saved
    # value of the "fsize" attribute; nil when the post has no image
    attr_reader :imgsize

    HARDCFG_imgsrv = "i.4cdn.org"

    REQUIRED_POST_ATTRIBUTES = [
      "no",
      "now",
      "resto",
      "time"
    ].freeze

    OPTIONAL_POST_ATTRIBUTES = [
      "closed",
      "archived",
      "archived_on",
      "capcode",
      "capcode_replies",
      "changes",
      "com",
      "country",
      "country_name",
      "custom_spoiler",
      "ext",
      "filedeleted",
      "filename",
      "fsize",
      "h",
      "id",
      "last_modified",
      "md5",
      "name",
      "omitted_images",
      "omitted_posts",
      "since4pass",
      "spoiler",
      "sticky",
      "sub",
      "tag",
      "tn_h",
      "tn_w",
      "trip",
      "w",
      # required as per documentation, but actually not always there!
      "tim", # image timestamp and filename
      # thread attributes in OP
      "replies",
      "images",
      "bumplimit",
      "imagelimit",
      "semantic_url",
    ].freeze

    # proto::    "http" or "https", used to build the image URL
    # board::    board name, e.g. "hr"
    # posthash:: Hash of one element of the "posts" array of the API JSON
    #
    # Raises RuntimeError when a required attribute is missing or when the
    # image size is not a plain number.
    def initialize(proto, board, posthash)
      @proto = proto
      @board = board
      @values = {} # values of the post
      @img_saved = false

      # https://i.4cdn.org/hr/1489025400296.jpg
      # ends with "/" so that the filename can be appended easily
      @imgbaseurl = "#{@proto}://#{HARDCFG_imgsrv}/#{board}/"

      # gather all attributes. required ...
      REQUIRED_POST_ATTRIBUTES.each do |attribute|
        if posthash[attribute].nil?
          raise "invalid JSON: attribute missing: " + attribute
        end
        @values[attribute] = posthash[attribute]
      end
      # ... and optional
      OPTIONAL_POST_ATTRIBUTES.each do |attribute|
        @values[attribute] = posthash[attribute]
      end

      @postnumber = @values["no"]

      return unless has_img?

      # image checks
      @imgbasename = @values['tim'].to_s
      @imgfilename = "#{@values['tim']}#{@values['ext']}"
      @oimgfilename = CGI.unescape("#{@values['filename']}#{@values['ext']}")
      # The original filename comes from the server and must never be able
      # to point outside the current directory, so path separators (and NUL)
      # are replaced before it is used as a local filename.
      unsafe = Regexp.union([File::SEPARATOR, File::ALT_SEPARATOR, "\0"].compact)
      @savefilename = (@imgbasename + " - " + @oimgfilename).gsub(unsafe, "_")
      if @values["fsize"].to_s !~ /\A\d+\z/
        raise "invalid JSON: image size contains garbage"
      end
      @imgsize = @values["fsize"]
    end

    # true when this post opened the thread
    def is_op?
      @values["resto"] == 0 # OP replies to postnumber 0
    end

    # true when the post carries an image
    def has_img?
      !@values["filename"].nil?
    end

    # Save the image of the post in the file #savefilename.
    #
    # The block, if given, is always called with the local filename - on
    # success and on error - so that the caller can report it.
    #
    # Raises
    # - Errno::EEXIST when a non-empty file with that name already exists
    #   (not really an error, may be rescued by the caller)
    # - RuntimeError when the post has no image or the server does not answer
    #   with a 2xx status
    # - whatever File / Net::HTTP raise
    #
    # A partially written file is removed whenever the download does not
    # complete (including on ^C); otherwise the next run would take it for an
    # already downloaded image.
    def imgsave(&blk)
      unless has_img?
        raise "programming error: downloading image when post doesn't have one"
      end

      file = nil
      completed = false
      begin
        # File.size(@savefilename) == @values["fsize"]
        # doesn't work because fsize sometimes contains the wrong size!
        if File.exist?(@savefilename) && File.size(@savefilename) > 0
          raise Errno::EEXIST
        end

        # download
        # https://i.4cdn.org/hr/1489025400296.jpg
        file = File.new(@savefilename, "wb") # binary: image data
        uri = URI(@imgbaseurl + @imgfilename)
        # TLS is only switched on when the thread URL asked for https
        use_tls = uri.scheme.to_s.casecmp("https").zero?
        Net::HTTP.start(uri.host, uri.port, :use_ssl => use_tls) do |http|
          http.request(Net::HTTP::Get.new(uri)) do |response|
            @httpstatus = response.code
            # anything but 2xx carries no image data
            unless response.is_a?(Net::HTTPSuccess)
              raise "server returned #{@httpstatus}"
            end
            response.read_body do |segment|
              file.write(segment)
            end
          end
        end
        completed = true
        @img_saved = true
      ensure
        if file
          file.close unless file.closed?
          # only remove what this call created and could not finish
          if !completed && File.exist?(@savefilename)
            File.delete(@savefilename)
          end
        end
        yield @savefilename if block_given?
      end
    end

    # posts are ordered by post number
    def <=>(other)
      postnumber <=> other.postnumber
    end
  end

  #
  ## 4chan thread
  #
  # Downloads the JSON of a thread and holds its posts, sorted by post number.
  class FCThread
    include Comparable # XXX needed?
    include FourChan

    HARDCFG_boardsrv = "boards.4chan.org"
    HARDCFG_apisrv = "a.4cdn.org"

    attr_reader :url, :proto, :board, :num, :op, :max, :lastpost

    # infos about thread (found in OP)
    THREAD_ATTRIBUTE_KEYS = [
      "replies",
      "images",
      "bumplimit",
      "imagelimit",
      "semantic_url"
    ].freeze

    # threadurl:: URL of the thread, https://boards.4chan.org/BOARD/thread/NUM
    #
    # Raises RuntimeError when the URL is not a thread URL, when the download
    # fails or when the JSON does not look like a thread.
    def initialize(threadurl)
      @url = threadurl
      @posts = []
      @apiurl = ""
      @attributes = {}

      properties_from_url(@url) # set global properties
      json = JSON.parse(download(@apiurl))

      # a thread is a hash with a single "posts" key that contains an array of
      # all posts
      posthashes = json.is_a?(Hash) ? json["posts"] : nil
      unless posthashes.is_a?(Array) && !posthashes.empty?
        raise "invalid JSON: thread contains no posts"
      end
      @posts = posthashes.map { |posthash| FCPost.new(@proto, @board, posthash) }
      @posts.sort! # should already be sorted, but better be safe

      # set some more metadata
      @op = @posts.first
      @max = @posts.last
      @lastpost = @max

      THREAD_ATTRIBUTE_KEYS.each do |key|
        if @op.values[key].nil?
          raise "invalid JSON: attribute missing in OP: " + key
        end
        @attributes[key] = @op.values[key]
      end
    end

    # yields every post in the thread, in ascending order
    def each(&blk)
      @posts.each(&blk)
    end

    # some interesting information about the thread
    def name
      @attributes["semantic_url"]
    end

    def replies
      @attributes["replies"] + 1 # OP does not count in JSON
    end

    def images
      @attributes["images"] + 1 # OP does not count in JSON
    end

    # Returns the posts whose post number lies within +range+ (both ends
    # included). A start of 0 or less means "from the first post", an end
    # below 0 means "up to the last post".
    def [](range)
      from = range.first > 0 ? range.first : @op.postnumber # translate 0 to first post
      to = range.last > -1 ? range.last : @lastpost.postnumber # translate -1 to last post
      @posts.select { |post| post.postnumber.between?(from, to) }
    end

    private

    # Set @proto, @board, @num and @apiurl from the thread URL.
    # Raises RuntimeError when the URL is not a 4chan thread URL.
    def properties_from_url(threadurl)
      # https://boards.4chan.org/hr/thread/2866390
      # \A and \z anchor the whole string, the host name is matched literally
      re = Regexp.new(
        "\\A\\s*(https?)://#{Regexp.escape(HARDCFG_boardsrv)}/+(.+?)/thread/+(\\d+)\\s*\\z",
        Regexp::IGNORECASE
      )
      match = re.match(threadurl.to_s)
      raise "invalid 4chan URL: #{threadurl}" unless match

      # https://a.4cdn.org/hr/thread/2866390.json
      @proto, @board, @num = match[1], match[2], match[3]
      @apiurl = "#{@proto}://#{HARDCFG_apisrv}/#{@board}/thread/#{@num}.json"
    end
  end
end

# Download images of a 4chan thread
# outputs its progress
class FCImgdownloader
  attr_reader :downloaded, :ignored, :failed, :alreadythere, :saved_filenames

  # fcthread::        FCThread
  # from_postnumber:: only posts after this post number are considered
  # img_regex::       Regexp the original image filename has to match, or nil
  def initialize(fcthread, from_postnumber, img_regex)
    @downloaded = 0
    @ignored = 0
    @failed = 0
    @alreadythere = 0
    @saved_filenames = []

    @fcthread = fcthread
    @from_postnumber = from_postnumber
    @img_regex = img_regex
  end

  # Downloads the image of every matching post and prints one line per image.
  # A failed image is counted and reported, the remaining ones are still
  # downloaded. The block parameter is accepted but never called.
  def download(&blk)
    @fcthread.each do |post|
      # skip everything up to and including the last known post
      next unless post.postnumber.between?(@from_postnumber + 1, @fcthread.max.postnumber)
      next unless post.has_img?
      if @img_regex && !@img_regex.match(post.oimgfilename)
        @ignored += 1
        next
      end

      last_filename = String.new
      msg = String.new
      begin
        # just collect the filename for user output and @saved_filenames
        post.imgsave { |saved_filename| last_filename = saved_filename }
      rescue Errno::EEXIST
        @alreadythere += 1
        msg = "#{last_filename} (already downloaded)"
      rescue => e
        @failed += 1
        msg = "#{last_filename} failed: #{e.message}"
        # no raise, just continue with the next image
      else
        msg = last_filename
        @saved_filenames.push last_filename
        @downloaded += 1
      ensure
        # user output
        puts msg
      end
    end
  end
end

#
## main
#

HARDCFG_statusfile = "4cdl.status"
HARDCFG_saucefile = "hier.her"

# returns
# - the last known post based on statusfile
# - 0 if there are no known posts
# Raises RuntimeError when the file contains anything but a single number.
def from_post(statusfile)
  begin
    content = File.read(statusfile)
  rescue Errno::ENOENT
    return 0
  end

  match = /\A\s*(\d+)\s*\z/.match(content)
  raise "invalid postnumber in #{statusfile}" unless match
  match[1].to_i
end

# Writes +postnumber+ (anything that responds to to_s) followed by a newline
# to +statusfile+, replacing previous content.
# Raises RuntimeError "error writing ..." on failure.
def save_to_file(statusfile, postnumber)
  begin
    # block form closes the file even when the write fails
    File.open(statusfile, "w") do |f|
      f.write(postnumber.to_s + "\n")
    end
  rescue => e
    raise "error writing #{statusfile}: #{e.message}"
  end
end

# Parses the command line. Options are removed from ARGV, the thread URL
# stays in ARGV[0].
#
# Returns the options Hash (:sauce, :img_regex, :debug, :help).
# Raises on invalid options and when not exactly one URL is given.
#
# XXX maybe GetoptLong wraps long lines better?
def parseoptions
  options = {}
  OptionParser.new do |opts|
    # NOTE(review): the banner text and the declarations of the sauce and
    # regex options were not readable in the reviewed copy of this file.
    # The banner, the switch names "--[no-]sauce" / "--regex" and their help
    # texts below are reconstructed from how options[:sauce] and
    # options[:img_regex] are used - confirm against the original.
    opts.banner = <<EOF
Usage: #{File.basename($PROGRAM_NAME)} [options] THREAD_URL

EOF

    opts.on("--[no-]sauce",
            "write #{HARDCFG_statusfile}, #{HARDCFG_saucefile} and the thread HTML") do |v|
      options[:sauce] = v
    end

    opts.on("--regex IMG_REGEX",
            "only download images whose original filename matches IMG_REGEX") do |v|
      begin
        options[:img_regex] = Regexp.new(v)
      rescue => e
        raise "invalid IMG_REGEX: #{e.message}"
      end
      # NOTE(review): presumably a filtered download must not update the
      # status files unless the user asks for it explicitly - confirm.
      options[:sauce] = false if options[:sauce].nil?
    end

    opts.on(nil, "--debug", "print debugging output") do |v|
      options[:debug] = v
    end

    opts.on("-h", "--help", "display this help") do |v|
      options[:help] = v
      puts opts
      exit
    end
  end.parse!

  # neither user nor option set it
  options[:sauce] = true if options[:sauce].nil?

  # ARGV
  if ARGV.count > 1
    raise "You can only download a single thread at a time"
  end
  if ARGV.count == 0
    raise "Please specify a 4chan thread URL"
  end

  options
end

begin
  options = parseoptions
rescue => e
  warn e.message
  exit 1
end
opt_url = ARGV[0]

#
## main
#
# XXX catch ^C
dl = nil
begin
  # with a filename filter the whole thread is searched again
  start_post = options[:img_regex] ? 0 : from_post(HARDCFG_statusfile)

  fcthread = FourChan::FCThread.new(opt_url)
  puts "#{fcthread.name} (#{fcthread.replies}/#{fcthread.images})"

  dl = FCImgdownloader.new(fcthread, start_post, options[:img_regex])
  dl.download

  if options[:sauce]
    # TODO: not when there are no images
    save_to_file(HARDCFG_statusfile, fcthread.max.postnumber)
    save_to_file(HARDCFG_saucefile, opt_url) unless File.exist?(HARDCFG_saucefile)
    save_to_file("#{fcthread.num}.html", FourChan.download(opt_url))
  end
rescue => e
  # print error for the user
  warn e.message
  warn e.backtrace if options[:debug]
ensure
  unless dl.nil?
    puts "#{dl.downloaded} downloaded, #{dl.ignored} ignored, #{dl.alreadythere} already downloaded"
    puts "#{dl.failed} failed" if dl.failed > 0
  end
end