#!/usr/bin/ruby

# NAME
#     dul.rb - Dumb URI Lister
#
# SYNOPSIS
#
#     dul.rb <options>
#
# DESCRIPTION
#
#     dul.rb lists URIs found at a given address provided they
#     match a certain pattern.  The default behavior is to read href
#     attributes from <a> links, but it can also retrieve <img>
#     sources, etc.  The matching pattern can of course be changed to
#     anything you like.
#
#     Matching files may be downloaded, and optionally renamed on the
#     fly using another ruby-compatible regexp. Information such as
#     user-agent or referer can also be (and is) faked.
#
#     This is only handy when you want to GET Apache fancy-indexed
#     directories and the like, but well it can do a little more.
#
#     Examples:
#
#       Download (-d) all pictures linked at (-b) http://foo/pics/
#       using http://foo/pics/ as a fake referer (-r).
#
#         dul.rb -drb http://foo/pics/
#
#       Download all pictures linked at http://foo/pics/ plus
#       rename (-R) them locally: foo_pics0.jpg, foo_pics1.png,
#       foo_pics2.jpg ...
#
#         dul.rb -R 's/.*\.(.*)$/foo_pics\C.\1/' -drb http://foo/pics/
#
#       Download all text files (-m) linked at http://foo/articles.html
#
#         dul.rb -m 'txt$' -drb http://foo/articles.html
#
#       Download all pictures linked, or shown (look at <img> tags as
#       well as <a>) at http://foo/gallery.php
#
#         dul.rb -t a,img -drb http://foo/gallery.php
#
# IDEAS
#
#     Handle more tags (namely embed, link, script, etc.)
#     Better regexp support, multiple regexp with -R option ?
#     Requests based on a given range
#     Recursive crawling (oh crap)
#     Cookies (?)
#
# BUGS
#
#     Ok, I'm just toying with ruby, and this is probably a dirty
#     mess. So what? :)
#
# AUTHOR
#
#     oz (@tuxaco dot net)
#     http://oz.tuxaco.net

require 'open-uri'
require 'uri'
require 'getoptlong'
require 'html-parser'
require 'formatter'
require 'progressbar'

# Run-time configuration of the script.
#
# Holds the built-in defaults and overrides them with whatever is given
# on the command line (see #parse_cmdline).
class Config

  # Tag names accepted by --tags.
  VALID_TAGS = %w[a img].freeze

  attr_reader   :baseUrl, :match, :http_proxy, :userAgent, :referer,
                :tags, :keepRelative, :download, :localRename, :sleep_time
  attr_accessor :fileCount

  # Set every option to its default value.
  def initialize
    @baseUrl = ''
    @match   = '(png|jpg|jpeg|gif)$'
    @http_proxy = ENV['http_proxy']
    @userAgent = 'Mozilla/5.0 (Windows i686) Gecko/20050517 Firefox/1.0.4'
    @referer = false
    @localRename = nil
    @tags = [ 'a' ]
    @keepRelative = false
    @download = false
    @fileCount = 0
    @sleep_time = 0
  end

  # Print the option summary on STDERR.
  def usage
    STDERR.print File.basename($0), " <options>\n",
      "    -h, --help,                   Show help and exit\n",
      "    -b, --base-url <url>,         Set base URL\n",
      "    -d, --download,               Download files\n",
      "    -i, --interval <seconds>,     Wait n seconds between downloads\n",
      "    -m, --match <pattern>,        Change file match expression\n",
      "    -R, --rename <s/re/repl/>,    Rename local file with regex\n",
      "    -t, --tags <tag1,tag2,...>,   Interesting tags\n",
      "    -u, --user-agent <agent>,     Change User-agent\n",
      "    -r, --referer,                Use base URL as referer\n",
      "    -k, --keep-rel,               Keep relative URIs\n"
  end

  # Read the options from ARGV and store them in this object.
  #
  # Exits the process when an option is invalid or when no base URL was
  # given: status 1 for --help and option-syntax errors, 128 for a
  # missing/invalid base URL, 42 for invalid tags, 43 for an invalid
  # rename expression and 44 for an invalid interval.
  def parse_cmdline
    opt_parser = GetoptLong.new
    opt_parser.set_options(
      [ '--help',       '-h', GetoptLong::NO_ARGUMENT ],
      [ '--download',   '-d', GetoptLong::NO_ARGUMENT ],
      [ '--interval',   '-i', GetoptLong::REQUIRED_ARGUMENT ],
      [ '--referer',    '-r', GetoptLong::NO_ARGUMENT ],
      [ '--keep-rel',   '-k', GetoptLong::NO_ARGUMENT ],
      [ '--base-url',   '-b', GetoptLong::REQUIRED_ARGUMENT ],
      [ '--match',      '-m', GetoptLong::REQUIRED_ARGUMENT ],
      [ '--user-agent', '-u', GetoptLong::REQUIRED_ARGUMENT ],
      [ '--tags',       '-t', GetoptLong::REQUIRED_ARGUMENT ],
      [ '--rename',     '-R', GetoptLong::REQUIRED_ARGUMENT ]
      )

    begin
      opt_parser.each_option do |optname, optargs|
        case optname
        when '--help'
          usage
          exit 1
        when '--base-url'
          @baseUrl = parse_http_url(optargs)
          unless @baseUrl
            STDERR.puts "Invalid URL: #{optargs}"
            usage
            exit 128
          end
        when '--download'
          @download = true
        when '--tags'
          # Validate each tag on its own: a single regexp over the whole
          # list lets run-together names such as "aimg" slip through.
          wanted = optargs.gsub(/\s+/, '').split(',')
          if wanted.empty? || !(wanted - VALID_TAGS).empty?
            STDERR.puts "Invalid tags: #{optargs}"
            exit 42
          end
          @tags = wanted
        when '--match'
          # Accept both 'pattern' and '/pattern/'.
          @match = (optargs =~ /^\/.*\/$/) ? optargs[1..-2] : optargs
        when '--rename'
          if optargs !~ /^s\/.*?\/.*?\/$/
            STDERR.puts "Invalid rename expression: #{optargs}"
            exit 43
          end
          @localRename = optargs
        when '--user-agent'
          @userAgent = optargs
        when '--interval'
          if optargs !~ /\A[0-9]+\z/
            STDERR.puts "Invalid interval: #{optargs}"
            exit 44
          end
          # Kernel#sleep refuses strings, so keep the interval as a number.
          @sleep_time = optargs.to_i
        when '--referer'
          @referer = true
        when '--keep-rel'
          @keepRelative = true
        end
      end
    rescue GetoptLong::Error
      # Common parent of the ambiguous-option, needless-argument,
      # missing-argument and invalid-option errors.
      usage
      exit 1
    end

    if @baseUrl.to_s.empty?
      STDERR.puts "I need a base URL.\n"
      usage
      exit 128
    end
  end

  private

  # Return +text+ parsed as an http(s) URI, or nil when it is not one.
  def parse_http_url(text)
    return nil unless text =~ /\Ahttps?:\/\/.+/
    URI.parse(text)
  rescue URI::InvalidURIError
    nil
  end
end

# HTML parser that only remembers the link targets we care about.
#
# While a document is fed to it, it collects the href of every <a> tag
# and the src of every <img> tag whose value matches the regexp source
# given at construction time (case-insensitively). The results are
# available through #a and #img.
class DumbHTMLParser < HTMLParser

  attr_reader :a, :img, :match

  # regex: regexp source (a String) that collected values must match.
  def initialize(regex)
    super(NullFormatter.new)
    @match = regex
    @a     = []
    @img   = []
  end

  # Handler for <img>: record matching src values.
  def do_img(attrs)
    attrs.each do |name, raw|
      next unless name == 'src'
      # drop the first and last character (the surrounding quotes)
      src = raw[1..-2]
      @img << src if src.match(/#{@match}/i)
    end
  end

  # Handler for <a>: record matching href values.
  def start_a(attrs)
    attrs.each do |name, raw|
      next unless name == 'href'
      # drop the first and last character (the surrounding quotes)
      href = raw[1..-2]
      @a << href if href.match(/#{@match}/i)
    end
  end
end

# ------------------------------------------------------------------[ helpers ]
# Fetch the document found at +uri+ and return its body as a String.
#
#   uri    - a URI object, or anything whose #to_s is an absolute URL
#   config - supplies userAgent, referer (flag) and baseUrl
#
# The configured User-Agent is always sent; a Referer header (the base
# URL) is only sent when config.referer is set.
#
# Returns nil, after printing the reason on STDERR, when the document
# cannot be fetched.
def fetch(uri, config)
  headers = { "User-Agent" => config.userAgent }
  headers["Referer"] = config.baseUrl.to_s if config.referer

  # Go through the URI object's own #open (provided by open-uri) rather
  # than Kernel#open, which no longer opens URLs on recent rubies.
  target = uri.respond_to?(:open) ? uri : URI.parse(uri.to_s)
  unless target.respond_to?(:open)
    raise ArgumentError, "don't know how to open this kind of URI"
  end

  # Block form: the handle is closed as soon as the body has been read.
  target.open(headers) { |remote| remote.read }
rescue StandardError => e
  STDERR.puts "Can't fetch: #{uri}\n#{e.message}"
  nil
end

# Return the absolute form of +url+, resolved against +baseUrl+.
#
#   url     - link as found in the page (String), relative or absolute
#   baseUrl - URI object of the page the link was found on
#
# Resolution is delegated to URI.join, so the port of the base URL is
# kept, "../" segments are resolved and a base without a path
# ("http://foo") works. Links that are not valid URIs (e.g. containing
# a space) are glued to the base by hand instead.
#
# Returns a String.
def getAbsUrl(url, baseUrl)
  URI.join(baseUrl.to_s, url).to_s
rescue URI::Error
  root = "#{baseUrl.scheme}://#{baseUrl.host}"
  if baseUrl.port && baseUrl.port != baseUrl.default_port
    root = "#{root}:#{baseUrl.port}"
  end
  return root + url if url[0, 1] == '/'

  # Directory part of the base path, trailing slash included.
  dir = baseUrl.path.to_s.sub(/[^\/]*\z/, '')
  dir = '/' if dir.empty?
  root + dir + url
end

# Build a file name that is not taken yet by appending "-<n>" to +file+,
# n being the smallest positive number for which no such file exists.
# A nil/false +file+ is handed back untouched.
def findNewName(file)
  return file unless file

  suffix = 1
  suffix += 1 while File.exist?("#{file}-#{suffix}")
  "#{file}-#{suffix}"
end

# Apply the sed-like expression held in config.localRename
# (s/search/replacement/) to fileName and return the new name.
#   \1 .. \n in the replacement refer to groups of the search regexp
#   \C       in the replacement stands for config.fileCount
def rename(fileName, config)
  sed_expr    = config.localRename
  search      = sed_expr.gsub(/^s\/(.*?)\/(.*)\/$/, '\1')
  replacement = Regexp.last_match(2)

  # A bare '.*' is anchored so that it matches once, not a second time
  # on the empty string left at the end of the name.
  search = '^.*$' if search == '.*'

  if replacement.include?("\\C")
    replacement = replacement.gsub(/\\C/, config.fileCount.to_s)
  end
  fileName.gsub(Regexp.new(search), replacement)
end

# Download +url+ into the current directory, showing a text progress bar
# when the server announces the size of the file.
#
#   url    - absolute URL of the file (String)
#   config - supplies userAgent, referer (flag), baseUrl, localRename
#            and the fileCount counter
#
# The local name is the last path segment of the URL, passed through
# rename() when a rename expression is configured and through
# findNewName() when a file of that name already exists.
# config.fileCount is incremented once per call, whether or not the
# download succeeds.
#
# Returns the local file name, or nil (after printing the reason on
# STDERR) when the download failed.
def downloadFile(url, config)
  localName = url.gsub(/.*\//, '')
  localName = rename(localName, config) if config.localRename
  localName = findNewName(localName)    if File.exist?(localName)
  config.fileCount += 1

  # Both names come from a remote page: never hand them to Kernel#open,
  # which would run a name starting with "|" as a shell command.
  remote = URI.parse(url)
  unless remote.respond_to?(:open)
    raise ArgumentError, "not a downloadable URL: #{url}"
  end

  pbar = nil
  options = {
    "User-Agent"         => config.userAgent,
    :content_length_proc => lambda { |total|
      if total && 0 < total
        pbar = ProgressBar.new(localName, total)
        pbar.file_transfer_mode
      end
    },
    :progress_proc       => lambda { |done|
      pbar.set done if pbar
    }
  }
  options["Referer"] = config.baseUrl.to_s if config.referer

  # The local file is only created once the remote side answered, so a
  # failed request does not leave an empty file behind.
  remote.open(options) do |remoteFile|
    File.open(localName, 'wb') { |localFile| localFile.write(remoteFile.read) }
  end

  # No bar exists when the server sent no Content-Length.
  pbar.finish if pbar
  localName
rescue StandardError => e
  STDERR.puts "Can't download file: #{localName}\n#{e.message}"
  nil
end

# ----------------------------------------------------------------------[ core ]
# Set config from cmd line
config = Config.new
config.parse_cmdline

# Fetch the base page, collect the matching links of every requested
# tag, then print them or download them.
data = fetch(config.baseUrl, config)
if data
  parser = DumbHTMLParser.new(config.match)
  parser.feed(data)
  parser.close

  # The interval may be held as text or as a number; Kernel#sleep only
  # takes numbers.
  pause = config.sleep_time.to_i

  config.tags.each do |tag|
    # The parser exposes one reader per tag name (#a, #img).
    parser.send(tag).each do |url|
      absUrl = url
      if !config.keepRelative && url !~ /^(ftp|https?):\/\//
        absUrl = getAbsUrl(url, config.baseUrl)
      end

      if config.download
        downloadFile(absUrl, config)
        # The interval is a pause between downloads; plain listing does
        # not wait.
        sleep pause if pause > 0
      else
        puts absUrl
      end
    end
  end
end