#!/usr/bin/ruby
# NAME
# dul.rb - Dumb URI Lister
#
# SYNOPSIS
#
# dul.rb <options>
#
# DESCRIPTION
#
# dul.rb lists URIs found at a given address provided they
# match a certain pattern. The default behavior is to read href
# attributes from <a> links, but it can also retrieve <img>
# sources, etc. Of course, matching pattern can be changed to
# whatever one would think about.
#
# Matching files may be downloaded, and optionally renamed on the
# fly using another ruby-compatible regexp. Information such as
# user-agent or referer can also be (and is) faked.
#
# This is only handy when you want to GET Apache fancy-indexed
# directories and the like, but well it can do a little more.
#
# Examples:
#
# Download (-d) all pictures linked at (-b) http://foo/pics/
# using http://foo/pics/ as a fake referer (-r).
#
# dul.rb -drb http://foo/pics/
#
# Download all pictures linked at http://foo/pics/ plus
# rename (-R) them locally: foo_pics0.jpg, foo_pics1.png,
# foo_pics2.jpg ...
#
# dul.rb -R 's/.*\.(.*)$/foo_pics\C.\1/' -drb http://foo/pics/
#
# Download all text files (-m) linked at http://foo/articles.html
#
# dul.rb -m 'txt$' -drb http://foo/articles.html
#
# Download all pictures linked, or shown (look at <img> tags as
# well as <a>) at http://foo/gallery.php
#
# dul.rb -t a,img -drb http://foo/gallery.php
#
# IDEAS
#
# Handle more tags (namely embed, link, script, etc.)
# Better regexp support, multiple regexp with -R option ?
# Requests based on a given range
# Recursive crawling (oh crap)
# Cookies (?)
#
# BUGS
#
# Ok, I'm just toying with ruby, and this is probably a dirty
# mess. So what? :)
#
# AUTHOR
#
# oz (@tuxaco dot net)
# http://oz.tuxaco.net
require 'open-uri'
require 'uri'
require 'getoptlong'
require 'html-parser'
require 'formatter'
require 'progressbar'
# Run-time settings for dul.rb: defaults plus whatever the command line
# overrides. Every setting is exposed through a reader; fileCount is the
# only value other code is expected to update.
class Config
  # Tag names accepted by --tags.
  KNOWN_TAGS = %w[a img].freeze

  attr_reader :baseUrl, :match, :http_proxy, :userAgent, :referer,
              :tags, :keepRelative, :download, :localRename, :sleep_time
  attr_accessor :fileCount

  # Start from the built-in defaults: match common picture extensions,
  # look at <a> tags only, list instead of downloading, no pause.
  def initialize
    @baseUrl = ''
    @match = '(png|jpg|jpeg|gif)$'
    @http_proxy = ENV['http_proxy']
    @userAgent = 'Mozilla/5.0 (Windows i686) Gecko/20050517 Firefox/1.0.4'
    @referer = false
    @localRename = nil
    @tags = ['a']
    @keepRelative = false
    @download = false
    @fileCount = 0
    @sleep_time = 0
  end

  # Print the option summary on STDERR.
  def usage
    STDERR.print File.basename($0), " <options>\n",
                 " -h, --help, Show help and exit\n",
                 " -b, --base-url <url>, Set base URL\n",
                 " -d, --download, Download files\n",
                 " -i, --interval <seconds>, Wait n seconds between downloads\n",
                 " -m, --match <pattern>, Change file match expression\n",
                 " -R, --rename <s/regex/replacement/>, Rename local file with regex\n",
                 " -t, --tags <tag1,tag2,...>, Interesting tags\n",
                 " -u, --user-agent <agent>, Change User-agent\n",
                 " -r, --referer, Use base URL as referer\n",
                 " -k, --keep-rel, Keep relative URIs\n"
  end

  # Read the options from ARGV into this object.
  #
  # Terminates the process (Kernel#exit) on --help (1), on an option
  # GetoptLong rejects (1), on a bad or missing base URL (128), bad tags
  # (42), a bad rename expression (43) or a bad interval (44).
  def parse_cmdline
    opt_parser = GetoptLong.new
    opt_parser.set_options(
      ['--help',       '-h', GetoptLong::NO_ARGUMENT],
      ['--download',   '-d', GetoptLong::NO_ARGUMENT],
      ['--interval',   '-i', GetoptLong::REQUIRED_ARGUMENT],
      ['--referer',    '-r', GetoptLong::NO_ARGUMENT],
      ['--keep-rel',   '-k', GetoptLong::NO_ARGUMENT],
      ['--base-url',   '-b', GetoptLong::REQUIRED_ARGUMENT],
      ['--match',      '-m', GetoptLong::REQUIRED_ARGUMENT],
      ['--user-agent', '-u', GetoptLong::REQUIRED_ARGUMENT],
      ['--tags',       '-t', GetoptLong::REQUIRED_ARGUMENT],
      ['--rename',     '-R', GetoptLong::REQUIRED_ARGUMENT]
    )

    begin
      opt_parser.each_option do |optname, optargs|
        case optname
        when '--help'
          usage
          exit 1
        when '--base-url'
          @baseUrl = parse_base_url(optargs)
        when '--download'
          @download = true
        when '--tags'
          @tags = parse_tags(optargs)
        when '--match'
          # Accept both /pattern/ and bare pattern.
          @match = optargs =~ /^\/.*\/$/ ? optargs[1..-2] : optargs
        when '--rename'
          @localRename = parse_rename(optargs)
        when '--user-agent'
          @userAgent = optargs
        when '--interval'
          @sleep_time = parse_interval(optargs)
        when '--referer'
          @referer = true
        when '--keep-rel'
          @keepRelative = true
        end
      end
    rescue GetoptLong::Error
      # The common base class covers every option error, whatever the
      # installed library calls its individual subclasses.
      usage
      exit 1
    end

    return unless @baseUrl == ''

    STDERR.puts "I need a base URL.\n"
    usage
    exit 128
  end

  private

  # Turn the --base-url argument into a URI, or exit 128 when it is not
  # an http(s) URL that URI can parse.
  def parse_base_url(text)
    return URI.parse(text) if text =~ /^https?:\/\/.+/

    reject_base_url(text)
  rescue URI::InvalidURIError
    reject_base_url(text)
  end

  def reject_base_url(text)
    STDERR.puts "Invalid URL: #{text}"
    usage
    exit 128
  end

  # Split the --tags argument into tag names. Each name is checked on its
  # own, so run-together input such as "aimg" is refused instead of being
  # stored as a tag nothing can handle.
  def parse_tags(text)
    compact = text.gsub(/\s+/, '')
    names = compact.split(',')
    return names if !names.empty? && names.all? { |name| KNOWN_TAGS.include?(name) }

    STDERR.puts "Invalid tags: #{compact}"
    exit 42
  end

  # Check that the --rename argument looks like s/pattern/replacement/.
  def parse_rename(text)
    return text if text =~ %r{\As/.*?/.*?/\z}

    STDERR.puts "Invalid rename expression: #{text}"
    exit 43
  end

  # Convert the --interval argument to an Integer number of seconds;
  # Kernel#sleep does not accept the raw option text.
  def parse_interval(text)
    return text.to_i if text =~ /\A[0-9]+\z/

    STDERR.puts "Invalid interval: #{text}"
    exit 44
  end
end
# HTML parser that remembers the link targets it walks past.
#
# Values of <a href> are stored in #a and values of <img src> in #img,
# but only when they match the pattern given to the constructor
# (case-insensitively).
class DumbHTMLParser < HTMLParser
  attr_reader :a, :img, :match

  # regex:: source text of the pattern a value must match to be kept.
  def initialize(regex)
    super(NullFormatter.new)
    @a = []
    @img = []
    @match = regex
  end

  # Handler for <img>: keep each matching src value.
  def do_img(attrs)
    attrs.each do |attrname, value|
      next unless attrname == 'src'

      # Drop the first and last character (the surrounding quotes).
      source = value[1..-2]
      @img.push(source) if source.match(/#{@match}/i)
    end
  end

  # Handler for an opening <a>: keep each matching href value.
  def start_a(attrs)
    attrs.each do |attrname, value|
      next unless attrname == 'href'

      # Drop the first and last character (the surrounding quotes).
      target = value[1..-2]
      @a.push(target) if target.match(/#{@match}/i)
    end
  end
end
# ------------------------------------------------------------------[ helpers ]
# Fetch the document at +uri+ and return its body as a String.
#
# uri::    a URI object (anything with a public #open) or a URL string.
# config:: provides #userAgent, #referer and #baseUrl; the base URL is
#          sent as Referer when #referer is set, an empty one otherwise.
#
# Returns nil, after reporting the problem on STDERR, when the fetch fails.
def fetch(uri, config)
  headers = {
    'User-Agent' => config.userAgent,
    'Referer'    => config.referer ? config.baseUrl.to_s : ''
  }
  # Go through URI#open (open-uri) rather than Kernel#open, which treats
  # a leading "|" as a command to run.
  target = uri.respond_to?(:open) ? uri : URI.parse(uri.to_s)
  # Block form closes the handle; #read yields the body on every Ruby
  # version.
  target.open(headers) { |page| page.read }
rescue StandardError => e
  # StandardError only, so Ctrl-C and exit still work.
  STDERR.puts "Can't fetch: #{uri}\n#{e.message}"
  nil
end
# Return the absolute form of +url+, resolved against +baseUrl+.
#
# url::     reference found in the page (relative, root-relative, ...).
# baseUrl:: URI object or URL string of the page it was found on.
#
# Resolution is delegated to URI.join, so the port, "../" segments and a
# base without a path are handled. When URI cannot parse the reference
# (a raw space, for instance) the result is built by plain string
# concatenation as a best effort.
def getAbsUrl(url, baseUrl)
  URI.join(baseUrl.to_s, url).to_s
rescue URI::Error
  base = baseUrl.to_s
  if url[0, 1] == '/'
    # Root-relative: keep scheme://host[:port] only.
    base[%r{\A[a-z][a-z0-9+.-]*://[^/]*}i].to_s + url
  else
    # Relative: replace whatever follows the last slash of the base.
    base.sub(%r{/[^/]*\z}, '') + '/' + url
  end
end
# Build an unused file name by appending "-<n>" to +file+, where n is the
# smallest positive integer for which no such file exists yet.
# A nil/false +file+ is handed back untouched.
def findNewName(file)
  return file unless file

  suffix = 1
  suffix += 1 while File.exist?("#{file}-#{suffix}")
  "#{file}-#{suffix}"
end
# Rename a file using a sed-style expression (s/foo/bar/).
#
# fileName:: the name to transform.
# config::   provides #localRename (the s/pattern/replacement/ text) and
#            #fileCount.
#
# In the replacement, \1 .. \n refer to groups of the pattern as usual
# and \C is replaced by the current file count. Every occurrence of the
# pattern is replaced. Returns +fileName+ unchanged when the expression
# is not of the s/pattern/replacement/ form.
def rename(fileName, config)
  parts = %r{\As/(.*?)/(.*)/\z}.match(config.localRename.to_s)
  return fileName unless parts

  pattern = parts[1]
  replcmt = parts[2]
  # A bare '.*' also matches the empty string at the end of the name, so
  # gsub would insert the replacement twice; anchor it.
  pattern = '^.*$' if pattern == '.*'
  replcmt = replcmt.gsub(/\\C/) { config.fileCount.to_s }
  fileName.gsub(/#{pattern}/, replcmt)
end
# Download +url+ into the current directory, showing a text progress bar
# when the server announces the content length.
#
# url::    absolute URL of the file.
# config:: provides #userAgent, #referer, #baseUrl, #localRename and
#          #fileCount (incremented once per attempted download).
#
# The local name is the last path segment of the URL, optionally run
# through rename, and given a numeric suffix when a file of that name
# already exists. Failures are reported on STDERR; nothing is raised.
def downloadFile(url, config)
  localName = url.gsub(/.*\//, '')
  localName = rename(localName, config) if config.localRename
  localName = findNewName(localName) if File.exist?(localName)
  # Incremented after renaming, so \C in the rename expression sees the
  # count as it was before this download.
  config.fileCount += 1

  pbar = nil
  options = {
    'User-Agent' => config.userAgent,
    'Referer'    => config.referer ? config.baseUrl.to_s : '',
    :content_length_proc => lambda { |total|
      if total && 0 < total
        pbar = ProgressBar.new(localName, total)
        pbar.file_transfer_mode
      end
    },
    :progress_proc => lambda { |done| pbar.set done if pbar }
  }

  # SECURITY: both names come from the fetched page. URI#open and
  # File.open are used instead of Kernel#open, which would run a name
  # starting with "|" as a shell command.
  URI.parse(url).open(options) do |remote|
    # The local file is created only once the remote side has answered,
    # so a failed request leaves no empty file behind. Binary mode keeps
    # pictures intact on every platform.
    File.open(localName, 'wb') { |local| local.write(remote.read) }
  end
  # No bar exists when the server sent no content length.
  pbar.finish if pbar
rescue StandardError => e
  # StandardError only, so Ctrl-C still interrupts a long download.
  STDERR.puts "Can't download file: #{localName}\n#{e.message}"
end
# ----------------------------------------------------------------------[ core ]
# Build the configuration from the command line (exits on bad input).
config = Config.new
config.parse_cmdline

# Fetch the base page, collect the matching links of every requested
# tag, then print them or download them.
data = fetch(config.baseUrl, config)
if data
  parser = DumbHTMLParser.new(config.match)
  parser.feed(data)
  parser.close

  # Kernel#sleep needs a number; coerce in case the interval is still
  # the option text.
  pause = config.sleep_time.to_i

  config.tags.each do |tag|
    # The parser exposes one reader per tag name (a, img).
    parser.send(tag).each do |url|
      absUrl = url
      if !config.keepRelative && url !~ /^(ftp|https?):\/\//
        absUrl = getAbsUrl(url, config.baseUrl)
      end

      if config.download
        downloadFile(absUrl, config)
      else
        puts absUrl
      end
      sleep pause if pause > 0
    end
  end
end