#!/usr/bin/ruby
# NAME
#   dul.rb - Dumb URI Lister
#
# SYNOPSIS
#
#   dul.rb <options>
#
# DESCRIPTION
#
#   dul.rb lists URIs found at a given address provided they
#   match a certain pattern. The default behavior is to read href
#   attributes from <a> links, but it can also retrieve <img>
#   sources, etc. Of course, the matching pattern can be changed to
#   whatever one would think about.
#
#   Matching files may be downloaded, and optionally renamed on the
#   fly using another ruby-compatible regexp. Information such as
#   user-agent or referer can also be (and is) faked.
#
#   This is only handy when you want to GET Apache fancy-indexed
#   directories and the like, but well it can do a little more.
#
#   Examples:
#
#   Download (-d) all pictures linked at (-b) http://foo/pics/
#   using http://foo/pics/ as a fake referer (-r).
#
#     dul.rb -drb http://foo/pics/
#
#   Download all pictures linked at http://foo/pics/ plus
#   rename (-R) them locally: foo_pics0.jpg, foo_pics1.png,
#   foo_pics2.jpg ...
#
#     dul.rb -R 's/.*\.(.*)$/foo_pics\C.\1/' -drb http://foo/pics/
#
#   Download all text files (-m) linked at http://foo/articles.html
#
#     dul.rb -m 'txt$' -drb http://foo/articles.html
#
#   Download all pictures linked, or shown (look at <img> tags as
#   well as <a>) at http://foo/gallery.php
#
#     dul.rb -t a,img -drb http://foo/gallery.php
#
# IDEAS
#
#   Handle more tags (namely embed, link, script, etc.)
#   Better regexp support, multiple regexp with -R option ?
#   Requests based on a given range
#   Recursive crawling (oh crap)
#   Cookies (?)
#
# BUGS
#
#   Ok, I'm just toying with ruby, and this is probably a dirty
#   mess. So what? :)
#
# AUTHOR
#
#   oz (@tuxaco dot net)
#   http://oz.tuxaco.net

require 'open-uri'
require 'uri'
require 'getoptlong'
require 'html-parser'
require 'formatter'
require 'progressbar'

# Run-time settings, initialised with defaults and then overridden from
# the command line by #parse_cmdline.
class Config
  # Tag names the parser below knows how to collect.
  VALID_TAGS = %w[a img].freeze

  attr_reader :baseUrl, :match, :http_proxy, :userAgent, :referer,
              :tags, :keepRelative, :download, :localRename, :sleep_time
  attr_accessor :fileCount

  def initialize
    @baseUrl = ''
    @match = '(png|jpg|jpeg|gif)$'
    @http_proxy = ENV['http_proxy']
    @userAgent = 'Mozilla/5.0 (Windows i686) Gecko/20050517 Firefox/1.0.4'
    @referer = false
    @localRename = nil
    @tags = ['a']
    @keepRelative = false
    @download = false
    @fileCount = 0
    @sleep_time = 0
  end

  # Print the option summary on STDERR.
  def usage
    STDERR.print File.basename($0), " <options>\n",
                 "  -h, --help,                  Show help and exit\n",
                 "  -b, --base-url <url>,        Set base URL\n",
                 "  -d, --download,              Download files\n",
                 "  -i, --interval <n>,          Wait n seconds between downloads\n",
                 "  -m, --match <regexp>,        Change file match expression\n",
                 "  -R, --rename <s/from/to/>,   Rename local file with regex\n",
                 "  -t, --tags <a,img>,          Interesting tags\n",
                 "  -u, --user-agent <string>,   Change User-agent\n",
                 "  -r, --referer,               Use base URL as referer\n",
                 "  -k, --keep-rel,              Keep relative URIs\n"
  end

  # Read the command line into this object.
  #
  # Exits the process (after printing a message on STDERR) when an option
  # is unknown or malformed, or when no base URL was given.
  def parse_cmdline
    opt_parser = GetoptLong.new
    opt_parser.set_options(
      ['--help',       '-h', GetoptLong::NO_ARGUMENT],
      ['--download',   '-d', GetoptLong::NO_ARGUMENT],
      ['--interval',   '-i', GetoptLong::REQUIRED_ARGUMENT],
      ['--referer',    '-r', GetoptLong::NO_ARGUMENT],
      ['--keep-rel',   '-k', GetoptLong::NO_ARGUMENT],
      ['--base-url',   '-b', GetoptLong::REQUIRED_ARGUMENT],
      ['--match',      '-m', GetoptLong::REQUIRED_ARGUMENT],
      ['--user-agent', '-u', GetoptLong::REQUIRED_ARGUMENT],
      ['--tags',       '-t', GetoptLong::REQUIRED_ARGUMENT],
      ['--rename',     '-R', GetoptLong::REQUIRED_ARGUMENT]
    )

    begin
      opt_parser.each_option do |optname, optargs|
        case optname
        when '--help'
          usage
          exit 1
        when '--base-url'   then @baseUrl = parse_base_url(optargs)
        when '--download'   then @download = true
        when '--tags'       then @tags = parse_tags(optargs)
        when '--match'      then @match = parse_match(optargs)
        when '--rename'     then @localRename = parse_rename(optargs)
        when '--user-agent' then @userAgent = optargs
        when '--interval'   then @sleep_time = parse_interval(optargs)
        when '--referer'    then @referer = true
        when '--keep-rel'   then @keepRelative = true
        end
      end
    rescue GetoptLong::Error
      # Parent of the ambiguous/needless/missing/invalid option errors; the
      # spelling of the "ambiguous" subclass differs between Ruby versions,
      # so the parent class is rescued instead of naming each one.
      usage
      exit 1
    end

    return unless @baseUrl.to_s.empty?

    STDERR.puts "I need a base URL.\n"
    usage
    exit 128
  end

  private

  # Report +message+ on STDERR, optionally show the usage, and exit with
  # +status+.
  def fail_with(message, status, show_usage = false)
    STDERR.puts message
    usage if show_usage
    exit status
  end

  # Accept only http(s) URLs and turn them into URI objects.
  def parse_base_url(arg)
    fail_with("Invalid URL: #{arg}", 128, true) unless arg =~ /\Ahttps?:\/\/.+/
    URI.parse(arg)
  rescue URI::InvalidURIError
    fail_with("Invalid URL: #{arg}", 128, true)
  end

  # Split a comma separated tag list and check every entry individually,
  # so that run-together input such as "aimg" is rejected here rather than
  # failing later when the tag is used as a method name.
  def parse_tags(arg)
    cleaned = arg.gsub(/\s+/, '')
    tags = cleaned.split(',')
    if tags.empty? || !(tags - VALID_TAGS).empty?
      fail_with("Invalid tags: #{cleaned}", 42)
    end
    tags
  end

  # Accept both 'foo$' and '/foo$/' and make sure the result compiles.
  def parse_match(arg)
    source = arg =~ /\A\/.*\/\z/ ? arg[1..-2] : arg
    Regexp.new(source)
    source
  rescue RegexpError => e
    fail_with("Invalid match expression: #{arg} (#{e.message})", 45)
  end

  # Only sed-like 's/pattern/replacement/' expressions are accepted.
  def parse_rename(arg)
    fail_with("Invalid rename expression: #{arg}", 43) if arg !~ /\As\/.*?\/.*?\/\z/
    arg
  end

  # The interval is kept as an Integer so it can be handed to sleep.
  def parse_interval(arg)
    fail_with("Invalid interval: #{arg}", 44) if arg !~ /\A[0-9]+\z/
    arg.to_i
  end
end

# Local HTML Parser: remembers the href of <a> tags and the src of <img>
# tags whose value matches the given expression (case-insensitively).
class DumbHTMLParser < HTMLParser
  attr_reader :a, :img, :match

  # regex:: source of the expression URIs must match to be kept.
  def initialize(regex)
    super(NullFormatter.new)
    @img = []
    @a = []
    @match = regex
    # Compiled once instead of once per attribute.
    @uri_filter = Regexp.new(regex.to_s, Regexp::IGNORECASE)
  end

  def do_img(attrs)
    collect_uri(attrs, 'src', @img)
  end

  def start_a(attrs)
    collect_uri(attrs, 'href', @a)
  end

  private

  # Push every +wanted+ attribute value that matches the filter on +bucket+.
  def collect_uri(attrs, wanted, bucket)
    attrs.each do |attrname, value|
      next unless attrname == wanted

      uri = unquote_attr(value)
      bucket.push(uri) if @uri_filter =~ uri
    end
  end

  # Remove one pair of surrounding quotes, if there is one.
  # NOTE(review): the original code assumed the parser always hands over
  # quoted values and dropped the first and last character blindly; values
  # without quotes are now left untouched - confirm against html-parser.
  def unquote_attr(value)
    text = value.to_s
    text =~ /\A(["'])(.*)\1\z/m ? $2 : text
  end
end

# ------------------------------------------------------------------[ helpers ]

# HTTP headers sent with every request: the faked User-Agent and, when
# requested on the command line, the base URL as Referer.
def request_headers(config)
  headers = { 'User-Agent' => config.userAgent }
  headers['Referer'] = config.baseUrl.to_s if config.referer
  headers
end

# Get URI content.
#
# Returns the body as a String, or nil (after reporting on STDERR) when the
# resource could not be retrieved.
def fetch(uri, config)
  # URI#open (from open-uri) is used rather than Kernel#open so that only
  # real URLs are ever opened, whatever the Ruby version.
  URI.parse(uri.to_s).open(request_headers(config)) { |page| page.read }
rescue StandardError => e
  STDERR.puts "Can't fetch: #{uri}\n#{e}"
  nil
end

# Return absolute form of url, resolved against baseUrl (a URI object).
def getAbsUrl(url, baseUrl)
  # Protocol-relative link: only the scheme is missing.
  return "#{baseUrl.scheme}:#{url}" if url[0, 2] == '//'

  if url[0, 1] == '/'
    root = "#{baseUrl.scheme}://#{baseUrl.host}"
    # Keep an explicit, non-default port of the base URL.
    root += ":#{baseUrl.port}" if baseUrl.port && baseUrl.port != baseUrl.default_port
    return root + url
  end

  # Relative link: replace everything after the last slash of the base.
  baseUrl.to_s.gsub(/(.*)\/.*$/, '\1') + '/' + url
end

# Find new unique file name adding an offset to the original:
# the first of file-1, file-2, ... that does not exist yet.
def findNewName(file)
  return file unless file

  offset = 1
  offset += 1 while File.exist?("#{file}-#{offset}")
  "#{file}-#{offset}"
end

# Rename file using sed-type regexp (s/foo/bar/)
# \1 ..\n can be used to interpolate as usual
# \C is replaced by current file count
#
# Returns fileName unchanged when the expression cannot be parsed.
def rename(fileName, config)
  parts = config.localRename.to_s.match(/\As\/(.*?)\/(.*)\/\z/)
  return fileName unless parts

  pattern = parts[1]
  replacement = parts[2]

  # A bare '.*' also matches the empty string at the end of the name, which
  # would apply the replacement twice; anchor it.
  pattern = '^.*$' if pattern == '.*'
  replacement = replacement.gsub(/\\C/) { config.fileCount.to_s }

  fileName.gsub(/#{pattern}/, replacement)
end

# Download a file, showing a text progress bar.
#
# The local name is the last path component of url, optionally rewritten
# by the rename expression, and suffixed with -N when it already exists.
# Failures are reported on STDERR and do not abort the program.
def downloadFile(url, config)
  localName = url.gsub(/.*\//, '')
  localName = rename(localName, config) if config.localRename
  localName = findNewName(localName) if File.exist?(localName)
  config.fileCount += 1

  pbar = nil
  options = request_headers(config).merge(
    :content_length_proc => lambda { |total|
      # The bar is only created when the server announces a length.
      if total && total > 0
        pbar = ProgressBar.new(localName, total)
        pbar.file_transfer_mode
      end
    },
    :progress_proc => lambda { |size| pbar.set(size) if pbar }
  )

  # The local file is created only once the transfer succeeded, in binary
  # mode, and both handles are closed by the block forms.
  URI.parse(url).open(options) do |remote|
    File.open(localName, 'wb') { |local| local.write(remote.read) }
  end
  pbar.finish if pbar
rescue StandardError => e
  STDERR.puts "Can't download file: #{localName}\n#{e}"
end

# ----------------------------------------------------------------------[ core ]

# Set config from cmd line
config = Config.new
config.parse_cmdline

# Parse, and print/download
data = fetch(config.baseUrl, config)
if data
  parser = DumbHTMLParser.new(config.match)
  parser.feed(data)
  parser.close

  config.tags.each do |tag|
    # Tags were validated against Config::VALID_TAGS, each of which is a
    # reader on the parser returning the collected URIs.
    parser.send(tag).each do |url|
      absUrl = url
      if !config.keepRelative && url !~ /^(ftp|https?):\/\//
        absUrl = getAbsUrl(url, config.baseUrl)
      end

      if config.download
        downloadFile(absUrl, config)
        # The interval only applies between downloads.
        sleep config.sleep_time if config.sleep_time > 0
      else
        puts absUrl
      end
    end
  end
end