The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'
# Create an agent that writes its log to mech.log and presents the
# 'Mac Safari' user agent alias.
agent = WWW::Mechanize.new do |a|
  a.log = Logger.new("mech.log")
end
agent.user_agent_alias = 'Mac Safari'

# Fetch the page and pick the first form named "f".
page = agent.get("http://www.google.com/")
search_form = page.forms.name("f").first

# Fill in the field named "q", submit the form and print the response body.
search_form.fields.name("q").value = "Hello"
search_results = agent.submit(search_form)

puts search_results.body
| VERSION | = | '0.6.11' | The version of Mechanize you are using. | |
| AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" | User Agent aliases | |
| CNONCE | = | Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535))) |
| redirect_ok | -> | follow_redirect? |
| ca_file | [RW] | |
| cert | [RW] | |
| conditional_requests | [RW] | |
| cookie_jar | [RW] | |
| follow_meta_refresh | [RW] | |
| history | [R] | |
| keep_alive | [RW] | |
| keep_alive_time | [RW] | |
| key | [RW] | |
| log | [RW] | |
| open_timeout | [RW] | |
| pass | [RW] | |
| pluggable_parser | [R] | |
| read_timeout | [RW] | |
| redirect_ok | [RW] | |
| user_agent | [RW] | |
| watch_for_set | [RW] |
# File lib/mechanize.rb, line 106
# Resets every piece of agent state to its default, then yields the new
# agent to the optional block so the caller can adjust the configuration.
def initialize
  # Settings exposed through attr_accessors
  @cookie_jar = CookieJar.new
  @log = @open_timeout = @read_timeout = nil
  @user_agent = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @redirect_ok = true # Should we follow redirects?

  # SSL material (attr_accessors as well)
  @ca_file = nil
  @cert = nil # OpenSSL Certificate
  @key = nil  # OpenSSL Private Key
  @pass = nil # OpenSSL Password

  # Read-only collaborators (attr_readers)
  @history = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication state
  @user = @password = nil # Auth user / password
  @digest = nil           # DigestAuth digest
  @auth_hash = {}         # Keep track of urls for sending auth

  # Proxy settings
  @proxy_addr = @proxy_port = nil
  @proxy_user = @proxy_pass = nil

  # Request behaviour switches
  @conditional_requests = true
  @follow_meta_refresh = false

  # Connection cache & keep alive
  @connection_cache = {}
  @keep_alive = true
  @keep_alive_time = 300

  yield self if block_given?
end
# File lib/mechanize.rb, line 622
# Joins +parameters+ into a "key=value&key=value" query string.
#
# parameters - a Hash, or an Array of [key, value] pairs.
#
# Pairs whose key is nil are skipped. Keys and values are both converted
# with to_s before being escaped by WEBrick::HTTPUtils.escape_form, so
# Symbol keys and non-String values are accepted alike.
#
# Returns the query string ("" when there are no usable pairs).
def self.build_query_string(parameters)
  pairs = parameters.map do |key, value|
    next if key.nil?
    [WEBrick::HTTPUtils.escape_form(key.to_s),
     WEBrick::HTTPUtils.escape_form(value.to_s)].join("=")
  end

  # Skipped pairs show up as nil entries; drop them before joining.
  pairs.compact.join("&")
end
# File lib/mechanize.rb, line 172
# Remembers the credentials in @user and @password; set_headers and
# gen_auth_header read them when building authenticated requests.
# Returns +password+.
def auth(user, password)
  @user, @password = user, password
  password
end
Sets the user and password to be used for basic authentication.
# File lib/mechanize.rb, line 168
# Sets the user and password used for authentication by handing both
# arguments, unchanged, to #auth and returning its result.
def basic_auth(user, password)
  auth(user, password)
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/mechanize.rb, line 198
# Follows +link+ and returns the page that #get fetches for it.
#
# The target is the link's 'href' attribute, falling back to its 'src'
# attribute and finally to link.href. It is resolved against the page the
# link belongs to, or against current_page when the link cannot report one.
def click(link)
  # A link that cannot report its page (any StandardError) has no referer.
  source_page =
    begin
      link.page
    rescue
      nil
    end

  target = link.attributes['href'] || link.attributes['src'] || link.href
  base_page = source_page || current_page()

  get(to_absolute_uri(target, base_page), source_page)
end
Fetches the URL passed in and returns a page.
# File lib/mechanize.rb, line 178
# Fetches +url+, records the result in the history and returns the page.
#
# url     - absolute or relative URL; relative URLs are resolved against the
#           referring page.
# referer - page to resolve against and to pass to fetch_page; defaults to
#           current_page, or to an empty placeholder Page when there is none.
# block   - forwarded to fetch_page.
def get(url, referer=nil, &block)
  origin = referer || current_page ||
    Page.new( nil, {'content-type'=>'text/html'})

  target = to_absolute_uri(url, origin)
  fetched = fetch_page(target, fetch_request(target), origin, &block)

  add_to_history(fetched)
  fetched
end
Fetch a file and return the contents of the file.
# File lib/mechanize.rb, line 191
# Fetches +url+ with #get and returns only the body of the resulting page.
def get_file(url)
  page = get(url)
  page.body
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/mechanize.rb, line 224
# POSTs +query+ to +url+ and returns whatever post_form returns.
#
# url   - destination URL.
# query - a Hash or an Array of [name, value] pairs; each pair becomes one
#         Field of a synthetic url-encoded POST form.
def post(url, query={})
  # Build a bare <form> element to back the Form object.
  form_node = Hpricot::Elem.new(Hpricot::STag.new('form'))
  form_node['method'] = 'POST'
  form_node['enctype'] = 'application/x-www-form-urlencoded'

  synthetic_form = Form.new(form_node)
  query.each do |name, value|
    synthetic_form.fields << Field.new(name, value)
  end

  post_form(url, synthetic_form)
end
Sets the proxy address, port, user, and password
# File lib/mechanize.rb, line 152
# Stores the proxy settings that fetch_page hands to Net::HTTP.new.
#
# addr, port - proxy host and port.
# user, pass - optional proxy credentials (default nil).
#
# Returns the four values as an array, in the order given.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/mechanize.rb, line 242
# Submits +form+, optionally registering +button+ as the pressed button
# first, and returns the resulting page.
#
# POST forms go through post_form. GET forms have their fields encoded into
# the query string of the action URI and are fetched with #get.
# Raises RuntimeError for any other form method.
def submit(form, button=nil)
  form.add_button_to_query(button) if button

  target = to_absolute_uri(form.action, form.page)
  http_verb = form.method.upcase

  return post_form(target, form) if http_verb == 'POST'
  raise "unsupported method: #{http_verb}" unless http_verb == 'GET'

  target.query = WWW::Mechanize.build_query_string(form.build_query)
  get(target)
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/mechanize.rb, line 276
# Yields the agent to the block and returns the block's value. Whatever the
# block does to the page history, including raising, @history is replaced
# afterwards by the copy taken before the block ran.
def transact
  snapshot = @history.dup
  begin
    yield(self)
  ensure
    # Runs on normal exit and on exceptions alike.
    @history = snapshot
  end
end
Returns whether or not a url has been visited
# File lib/mechanize.rb, line 262
# True when visited_page finds an entry for +url+, false otherwise.
def visited?(url)
  found = visited_page(url)
  !found.nil?
end
# File lib/mechanize.rb, line 339
# Builds the value of a Digest "Authorization" header (MD5) in answer to a
# server challenge, using the credentials stored in @user / @password.
#
# uri         - request URI; its path is used as the digest "uri" value.
# request     - request object; its HTTP method goes into the digest.
# auth_header - the challenge, i.e. the WWW-Authenticate header value.
# is_IIS      - when true the qop value is sent quoted, otherwise bare.
#
# Each call increments the shared @@nonce_count. When the challenge carries
# a qop directive the response is computed over
# HA1:nonce:nc:cnonce:qop:HA2; without qop the nc/cnonce/qop parts are left
# out of both the digest and the header (the RFC 2069 compatible form).
#
# Raises ArgumentError when the challenge cannot be parsed or has no nonce.
# Returns the header value as a String.
def gen_auth_header(uri, request, auth_header, is_IIS = false)
  @@nonce_count += 1

  # Everything after the scheme word ("Digest ...") holds the directives.
  directives = auth_header.to_s[/^\w+ (.*)/, 1]
  if directives.nil?
    raise ArgumentError, "malformed authentication challenge: #{auth_header.inspect}"
  end

  # Only quoted directives (name="value") are picked up.
  params = {}
  directives.scan(/(\w+)="(.*?)"/) { |name, value| params[name] = value }

  nonce = params['nonce']
  raise ArgumentError, 'authentication challenge has no nonce' if nonce.nil?

  qop = params['qop']
  nonce_count = '%08x' % @@nonce_count

  ha1 = Digest::MD5.hexdigest("#{@user}:#{params['realm']}:#{@password}")
  ha2 = Digest::MD5.hexdigest("#{request.method}:#{uri.path}")

  digest_parts =
    if qop
      [ha1, nonce, nonce_count, CNONCE, qop, ha2]
    else
      [ha1, nonce, ha2]
    end
  response = Digest::MD5.hexdigest(digest_parts.join(':'))

  fields = []
  fields << "Digest username=\"#{@user}\""
  fields << "realm=\"#{params['realm']}\""
  if qop
    fields << (is_IIS ? "qop=\"#{qop}\"" : "qop=#{qop}")
  end
  fields << "uri=\"#{uri.path}\""
  fields << "algorithm=MD5"
  fields << "nonce=\"#{nonce}\""
  if qop
    fields << "nc=#{nonce_count}"
    fields << "cnonce=\"#{CNONCE}\""
  end
  fields << "response=\"#{response}\""

  fields.join(', ')
end
# File lib/mechanize.rb, line 288
# Adds the agent's standard headers to +request+ and returns the request.
#
# uri      - URI being requested; selects cookies, the history entry used
#            for conditional requests, and the per-host auth scheme.
# request  - Net::HTTP request object to decorate.
# cur_page - referring page; its uri (when present) becomes the Referer.
#
# Headers set: Connection / Keep-Alive, Accept-Encoding, Accept-Language,
# Accept-Charset, Cookie (only when the jar yields cookies for +uri+),
# Referer, User-Agent, If-Modified-Since (conditional requests only) and
# basic or digest Authorization for hosts recorded in @auth_hash.
def set_headers(uri, request, cur_page)
  if @keep_alive
    request.add_field('Connection', 'keep-alive')
    request.add_field('Keep-Alive', keep_alive_time.to_s)
  else
    request.add_field('Connection', 'close')
  end
  request.add_field('Accept-Encoding', 'gzip,identity')
  request.add_field('Accept-Language', 'en-us,en;q=0.5')
  request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')

  unless @cookie_jar.empty?(uri)
    cookies = @cookie_jar.cookies(uri)
    # Never send a Cookie header without a value.
    if cookies.length > 0
      if log
        cookies.each do |c|
          log.debug("using cookie: #{c}")
        end
      end
      request.add_field('Cookie', cookies.join("; "))
    end
  end

  # Add Referer header to request
  unless cur_page.uri.nil?
    request.add_field('Referer', cur_page.uri.to_s)
  end

  # Add User-Agent header to request
  request.add_field('User-Agent', @user_agent) if @user_agent

  # Add If-Modified-Since if page is in history
  if @conditional_requests
    if( (page = visited_page(uri)) && page.response['Last-Modified'] )
      request.add_field('If-Modified-Since', page.response['Last-Modified'])
    end
  end

  if( @auth_hash[uri.host] )
    case @auth_hash[uri.host]
    when :basic
      request.basic_auth(@user, @password)
    when :digest
      # Reuse the previous digest response when no challenge is stored.
      @digest_response ||= nil
      @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
      request.add_field('Authorization', @digest_response) if @digest_response
    end
  end

  request
end
# File lib/mechanize.rb, line 634
# Records +page+ in the history under the absolute form of its own URI.
# Returns whatever @history.push returns.
def add_to_history(page)
  absolute_uri = to_absolute_uri(page.uri)
  @history.push(page, absolute_uri)
end
uri is an absolute URI
# File lib/mechanize.rb, line 444
# Sends +request+ to +uri+ over a (possibly cached) Net::HTTP connection and
# returns the resulting page object.
#
# uri          - absolute URI; only the http and https schemes are accepted.
# request      - Net::HTTP request object; set_headers decorates it here.
# cur_page     - page handed to set_headers (its uri becomes the Referer);
#                defaults to current_page.
# request_data - extra arguments splatted into Net::HTTP#request; post_form
#                passes the encoded form body this way.
#
# Response handling, in order: the first meta-refresh link is clicked when
# follow_meta_refresh is set; success responses return the page; a 304
# returns the page stored in the history; redirects are followed when
# follow_redirect? allows it; a 401 is retried with basic or digest
# credentials unless the host was already tried.
#
# Raises RuntimeError for an unsupported scheme or content encoding, and
# ResponseCodeError for an unhandled status or a 401 that cannot be retried.
444: def fetch_page(uri, request, cur_page=current_page(), request_data=[])
445: raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
446:
447: log.info("#{ request.class }: #{ request.path }") if log
448:
449: page = nil
450:
# One cache entry per "host:port", holding the connection and the
# keep-alive options last reported by that server.
451: cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
452: :connection => nil,
453: :keep_alive_options => {},
454: })
455: http_obj = cache_obj[:connection]
456: if http_obj.nil? || ! http_obj.started?
457: http_obj = cache_obj[:connection] =
458: Net::HTTP.new( uri.host,
459: uri.port,
460: @proxy_addr,
461: @proxy_port,
462: @proxy_user,
463: @proxy_pass
464: )
465: cache_obj[:keep_alive_options] = {}
466:
467: # Specify timeouts if given
468: http_obj.open_timeout = @open_timeout if @open_timeout
469: http_obj.read_timeout = @read_timeout if @read_timeout
470: end
471:
# SSL is configured only before the connection is started. The peer
# certificate is verified only when a CA file has been set.
472: if uri.scheme == 'https' && ! http_obj.started?
473: http_obj.use_ssl = true
474: http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
475: if @ca_file
476: http_obj.ca_file = @ca_file
477: http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
478: end
# @cert and @key are file paths; their contents are read here.
479: if @cert && @key
480: http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
481: http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
482: end
483: end
484:
485: # If we're keeping connections alive and the last request time is too
486: # long ago, stop the connection. Or, if the max requests left is 1,
487: # reset the connection.
488: if @keep_alive && http_obj.started?
489: opts = cache_obj[:keep_alive_options]
490: if((opts[:timeout] &&
491: Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
492: opts[:max] && opts[:max].to_i == 1)
493:
494: log.debug('Finishing stale connection') if log
495: http_obj.finish
496:
497: end
498: end
499:
500: http_obj.start unless http_obj.started?
501:
502: request = set_headers(uri, request, cur_page)
503:
504: # Log specified headers for the request
505: if log
506: request.each_header do |k, v|
507: log.debug("request-header: #{ k } => #{ v }")
508: end
509: end
510:
511: cache_obj[:last_request_time] = Time.now.to_i
512:
513: # Send the request
# The block reads the body in parts into memory, decodes it and builds
# the page object with the parser chosen for the content type.
514: response = http_obj.request(request, *request_data) {|response|
515:
516: body = StringIO.new
517: total = 0
518: response.read_body { |part|
519: total += part.length
520: body.write(part)
521: log.debug("Read #{total} bytes") if log
522: }
523: body.rewind
524:
525: response.each_header { |k,v|
526: log.debug("response-header: #{ k } => #{ v }")
527: } if log
528:
# Keep only the media type: everything before the first ';', lowercased.
529: content_type = nil
530: unless response['Content-Type'].nil?
531: data = response['Content-Type'].match(/^([^;]*)/)
532: content_type = data[1].downcase unless data.nil?
533: end
534:
535: response_body =
536: if encoding = response['Content-Encoding']
537: case encoding.downcase
538: when 'gzip'
539: log.debug('gunzip body') if log
540: Zlib::GzipReader.new(body).read
# NOTE(review): x-gzip bodies are returned without gunzipping; confirm
# that this is intentional.
541: when 'x-gzip'
542: body.read
543: else
544: raise 'Unsupported content encoding'
545: end
546: else
547: body.read
548: end
549:
550: # Find our pluggable parser
551: page = @pluggable_parser.parser(content_type).new(
552: uri,
553: response,
554: response_body,
555: response.code
556: ) { |parser|
557: parser.mech = self if parser.respond_to? :mech=
558: if parser.respond_to?(:watch_for_set=) && @watch_for_set
559: parser.watch_for_set = @watch_for_set
560: end
561: }
562:
563: }
564:
565: # If the server sends back keep alive options, save them
566: if keep_alive_info = response['keep-alive']
567: keep_alive_info.split(/,\s*/).each do |option|
568: k, v = option.split(/=/)
569: cache_obj[:keep_alive_options] ||= {}
570: cache_obj[:keep_alive_options][k.intern] = v
571: end
572: end
573:
# Store every cookie the response sets for this uri.
574: (response.get_fields('Set-Cookie')||[]).each do |cookie|
575: Cookie::parse(uri, cookie, log) { |c|
576: log.debug("saved cookie: #{c}") if log
577: @cookie_jar.add(uri, c)
578: }
579: end
580:
581: log.info("status: #{ page.code }") if log
582:
# Map the status code to its Net::HTTP response class for the checks below.
583: res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
584:
585: if follow_meta_refresh && page.respond_to?(:meta) &&
586: (redirect = page.meta.first)
587: return redirect.click
588: end
589:
590: return page if res_klass <= Net::HTTPSuccess
591:
592: if res_klass == Net::HTTPNotModified
593: log.debug("Got cached page") if log
594: return visited_page(uri)
595: elsif res_klass <= Net::HTTPRedirection
596: return page unless follow_redirect?
597: log.info("follow redirect to: #{ response['Location'] }") if log
598: from_uri = page.uri
599: abs_uri = to_absolute_uri(response['Location'].to_s, page)
600: page = fetch_page(abs_uri, fetch_request(abs_uri), page)
# Record the redirect target under the URI that was originally requested.
601: @history.push(page, from_uri)
602: return page
603: elsif res_klass <= Net::HTTPUnauthorized
604: raise ResponseCodeError.new(page) unless @user || @password
# A host already present in @auth_hash has been tried before; give up
# instead of retrying again.
605: raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
606: if response['www-authenticate'] =~ /Digest/i
607: @auth_hash[uri.host] = :digest
608: @digest = response['www-authenticate']
609: else
610: @auth_hash[uri.host] = :basic
611: end
# Retry with a fresh request of the same method; set_headers adds the
# credentials now that the host is recorded in @auth_hash.
612: return fetch_page( uri,
613: fetch_request(uri, request.method.downcase.to_sym),
614: cur_page,
615: request_data
616: )
617: end
618:
619: raise ResponseCodeError.new(page), "Unhandled response", caller
620: end
Creates a new request object based on the scheme and type
# File lib/mechanize.rb, line 434
# Creates a Net::HTTP request object for +uri+.
#
# uri  - URI with an http or https scheme; anything else raises
#        RuntimeError ("unsupported scheme").
# type - :get builds a Net::HTTP::Get; every other value builds a
#        Net::HTTP::Post.
def fetch_request(uri, type = :get)
  supported = ['http', 'https'].include?(uri.scheme)
  raise "unsupported scheme" unless supported

  request_class = (type == :get) ? Net::HTTP::Get : Net::HTTP::Post
  request_class.new(uri.request_uri)
end
# File lib/mechanize.rb, line 414
# POSTs +form+ to +url+, records the result in the history and returns it.
#
# url  - destination, resolved against the form's page (or current_page, or
#        an empty placeholder Page when neither exists).
# form - supplies request_data (the body) and enctype (the Content-Type).
def post_form(url, form)
  origin = form.page || current_page ||
    Page.new( nil, {'content-type'=>'text/html'})

  payload = form.request_data

  target = to_absolute_uri(url, origin)
  request = fetch_request(target, :post)
  request.add_field('Content-Type', form.enctype)
  request.add_field('Content-Length', payload.size.to_s)

  log.debug("query: #{ payload.inspect }") if log

  # The body travels as the single extra argument to the HTTP request.
  fetched = fetch_page(target, request, origin, [payload])
  add_to_history(fetched)
  fetched
end
# File lib/mechanize.rb, line 380
# Turns +url+ into an absolute URI object.
#
# url      - a URI object (used as is) or anything responding to to_s. A
#            string is stripped, has characters outside the 0-125 byte range
#            percent-encoded, is escaped piecewise so that existing %XX
#            sequences and '#' survive, HTML-unescaped and then parsed.
# cur_page - page used to resolve relative URLs; defaults to current_page.
#            The last <base> of the page is preferred when the page exposes
#            bases and that base has an absolute uri.
#
# Relative references with an empty path ("", "?page=2", "#top") are
# resolved against the current page rather than the site root; the path
# only defaults to "/" once the URI is absolute.
#
# Raises RuntimeError when +url+ is relative and cur_page has no uri.
# Returns the absolute URI.
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
      sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
    }

    url = URI.parse(
            Util.html_unescape(
              url.split(/%[0-9A-Fa-f]{2}|#/).zip(
                url.scan(/%[0-9A-Fa-f]{2}|#/)
              ).map { |x,y|
                "#{URI.escape(x)}#{y}"
              }.join('')
            )
          )
  end

  # construct an absolute uri
  if url.relative?
    raise 'no history. please specify an absolute URL' unless cur_page.uri
    base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
    base_uri = (base && base.uri && base.uri.absolute?) ? base.uri : cur_page.uri
    url = base_uri + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  # Applied after resolution so an empty relative path keeps the base path.
  url.path = '/' if url.path.length == 0

  return url
end