Class WWW::Mechanize
In: lib/mechanize/cookie.rb
lib/mechanize/errors.rb
lib/mechanize/form.rb
lib/mechanize/form_elements.rb
lib/mechanize/history.rb
lib/mechanize/list.rb
lib/mechanize/page.rb
lib/mechanize/page_elements.rb
lib/mechanize/pluggable_parsers.rb
lib/mechanize.rb
Parent: Object
Mechanize\n[lib/mechanize.rb\nlib/mechanize/cookie.rb\nlib/mechanize/errors.rb\nlib/mechanize/form.rb\nlib/mechanize/form_elements.rb\nlib/mechanize/history.rb\nlib/mechanize/list.rb\nlib/mechanize/page.rb\nlib/mechanize/page_elements.rb\nlib/mechanize/pluggable_parsers.rb] lib/mechanize.rb WWW dot/m_19_0.png

Synopsis

The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.

Example

 require 'rubygems'
 require 'mechanize'
 require 'logger'

 agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
 agent.user_agent_alias = 'Mac Safari'
 page = agent.get("http://www.google.com/")
 search_form = page.forms.name("f").first
 search_form.fields.name("q").value = "Hello"
 search_results = agent.submit(search_form)
 puts search_results.body

Methods

Classes and Modules

Class WWW::Mechanize::Base
Class WWW::Mechanize::Button
Class WWW::Mechanize::CheckBox
Class WWW::Mechanize::ContentTypeError
Class WWW::Mechanize::Cookie
Class WWW::Mechanize::CookieJar
Class WWW::Mechanize::Field
Class WWW::Mechanize::File
Class WWW::Mechanize::FileSaver
Class WWW::Mechanize::FileUpload
Class WWW::Mechanize::Form
Class WWW::Mechanize::Frame
Class WWW::Mechanize::GlobalForm
Class WWW::Mechanize::Headers
Class WWW::Mechanize::History
Class WWW::Mechanize::ImageButton
Class WWW::Mechanize::Link
Class WWW::Mechanize::List
Class WWW::Mechanize::Meta
Class WWW::Mechanize::MultiSelectList
Class WWW::Mechanize::Option
Class WWW::Mechanize::Page
Class WWW::Mechanize::PluggableParser
Class WWW::Mechanize::REXMLPage
Class WWW::Mechanize::RadioButton
Class WWW::Mechanize::ResponseCodeError
Class WWW::Mechanize::SelectList

Constants

VERSION = '0.6.11'   The version of Mechanize you are using.
AGENT_ALIASES = { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" }   User Agent aliases
CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))

External Aliases

redirect_ok -> follow_redirect?

Attributes

ca_file  [RW] 
cert  [RW] 
conditional_requests  [RW] 
cookie_jar  [RW] 
follow_meta_refresh  [RW] 
history  [R] 
keep_alive  [RW] 
keep_alive_time  [RW] 
key  [RW] 
log  [RW] 
open_timeout  [RW] 
pass  [RW] 
pluggable_parser  [R] 
read_timeout  [RW] 
redirect_ok  [RW] 
user_agent  [RW] 
watch_for_set  [RW] 

Public Class methods

[Source]

     # File lib/mechanize.rb, line 106
106:   def initialize # Sets every setting to its default, then yields self to an optional configuration block.
107:     # attr_accessors
108:     @cookie_jar     = CookieJar.new
109:     @log            = nil # no logger by default; callers guard with "if log"
110:     @open_timeout   = nil # only applied to the connection in fetch_page when set
111:     @read_timeout   = nil # only applied to the connection in fetch_page when set
112:     @user_agent     = AGENT_ALIASES['Mechanize'] # default User-Agent string
113:     @watch_for_set  = nil
114:     @ca_file        = nil
115:     @cert           = nil # OpenSSL Certificate
116:     @key            = nil # OpenSSL Private Key
117:     @pass           = nil # OpenSSL Password
118:     @redirect_ok    = true # Should we follow redirects?
119:     
120:     # attr_readers
121:     @history        = WWW::Mechanize::History.new
122:     @pluggable_parser = PluggableParser.new
123: 
124:     # Auth variables
125:     @user           = nil # Auth User
126:     @password       = nil # Auth Password
127:     @digest         = nil # DigestAuth Digest
128:     @auth_hash      = {}  # Keep track of urls for sending auth
129: 
130:     # Proxy settings
131:     @proxy_addr     = nil
132:     @proxy_pass     = nil
133:     @proxy_port     = nil
134:     @proxy_user     = nil
135: 
136:     @conditional_requests = true # set_headers adds If-Modified-Since for pages already in history
137: 
138:     @follow_meta_refresh  = false
139: 
140:     # Connection Cache & Keep alive
141:     @connection_cache = {} # keyed by "host:port" in fetch_page
142:     @keep_alive_time  = 300 # sent as the value of the Keep-Alive request header
143:     @keep_alive       = true
144: 
145:     yield self if block_given?
146:   end

Private Class methods

[Source]

     # File lib/mechanize.rb, line 622
622:   def self.build_query_string(parameters) # Joins key/value pairs into an escaped "k=v&k2=v2" query string.
623:     vals = [] 
624:     parameters.each { |k,v|
625:       next if k.nil? # pairs with a nil key are dropped
626:       vals <<
627:       [WEBrick::HTTPUtils.escape_form(k), 
628:        WEBrick::HTTPUtils.escape_form(v.to_s)].join("=") # value is stringified first, so a nil value becomes ""
629:     }
630: 
631:     vals.join("&")
632:   end

Public Instance methods

[Source]

     # File lib/mechanize.rb, line 172
172:   def auth(user, password) # Stores the credentials that set_headers and gen_auth_header read for basic and digest auth.
173:     @user       = user
174:     @password   = password
175:   end

Equivalent to the browser back button. Returns the most recent page visited.

[Source]

     # File lib/mechanize.rb, line 214
214:   def back # Steps back one entry in the history.
215:     @history.pop # delegates to History#pop and returns its result
216:   end

Sets the user and password to be used for basic authentication.

[Source]

     # File lib/mechanize.rb, line 168
168:   def basic_auth(user, password) # Thin wrapper: forwards both arguments unchanged to #auth.
169:     auth(user, password)
170:   end

Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.

[Source]

     # File lib/mechanize.rb, line 198
198:   def click(link) # Resolves the link's target against its page and fetches it with #get.
199:     referer =
200:       begin
201:         link.page
202:       rescue # any StandardError from link.page means "no referer"
203:         nil
204:       end
205:     uri = to_absolute_uri(
206:       link.attributes['href'] || link.attributes['src'] || link.href, # first non-nil of: href attribute, src attribute, #href
207:       referer || current_page() # fall back to the current page as the base for relative targets
208:     )
209:     get(uri, referer)
210:   end

Returns a list of cookies stored in the cookie jar.

[Source]

     # File lib/mechanize.rb, line 163
163:   def cookies # Exposes the contents of the cookie jar.
164:     @cookie_jar.to_a # result of CookieJar#to_a
165:   end

Returns the current page loaded by Mechanize

[Source]

     # File lib/mechanize.rb, line 257
257:   def current_page # The most recent history entry.
258:     @history.last # whatever History#last returns when nothing has been fetched yet
259:   end

Fetches the URL passed in and returns a page.

[Source]

     # File lib/mechanize.rb, line 178
178:   def get(url, referer=nil, &block) # Fetches url with a GET request, adds the page to the history and returns it.
179:     cur_page = referer || current_page ||
180:                     Page.new( nil, {'content-type'=>'text/html'}) # placeholder page when there is neither a referer nor history
181: 
182:     # fetch the page
183:     abs_uri = to_absolute_uri(url, cur_page) # relative urls are resolved against cur_page
184:     request = fetch_request(abs_uri) # fetch_request defaults to :get
185:     page = fetch_page(abs_uri, request, cur_page, &block) # NOTE(review): fetch_page as listed never yields, so the block looks unused — confirm
186:     add_to_history(page)
187:     page
188:   end

Fetch a file and return the contents of the file.

[Source]

     # File lib/mechanize.rb, line 191
191:   def get_file(url) # Convenience wrapper: performs #get (so the page is added to history) and returns only the body.
192:     get(url).body
193:   end

[Source]

     # File lib/mechanize.rb, line 149
149:   def max_history; @history.max_size; end # reads the size limit stored on the History object

[Source]

     # File lib/mechanize.rb, line 148
148:   def max_history=(length); @history.max_size = length; end # forwards the new size limit to the History object
page()

Alias for current_page

Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:

 agent.post('http://example.com/', "foo" => "bar")

or

 agent.post('http://example.com/', [ ["foo", "bar"] ])

[Source]

     # File lib/mechanize.rb, line 224
224:   def post(url, query={}) # Builds a throwaway urlencoded POST form from query and sends it to url via post_form.
225:     node = Hpricot::Elem.new(Hpricot::STag.new('form')) # synthetic <form> element to back the Form object
226:     node['method'] = 'POST'
227:     node['enctype'] = 'application/x-www-form-urlencoded'
228: 
229:     form = Form.new(node)
230:     query.each { |k,v| # works for a Hash or for an array of [key, value] pairs
231:       form.fields << Field.new(k,v)
232:     }
233:     post_form(url, form)
234:   end

Sets the proxy address, port, user, and password

[Source]

     # File lib/mechanize.rb, line 152
152:   def set_proxy(addr, port, user = nil, pass = nil) # Stores the proxy settings that fetch_page passes to Net::HTTP.new; user and pass are optional.
153:     @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
154:   end

Submit a form with an optional button. Without a button:

 page = agent.get('http://example.com')
 agent.submit(page.forms.first)

With a button

 agent.submit(page.forms.first, page.forms.first.buttons.first)

[Source]

     # File lib/mechanize.rb, line 242
242:   def submit(form, button=nil) # Sends form by POST or GET according to form.method; raises RuntimeError for any other method.
243:     form.add_button_to_query(button) if button
244:     uri = to_absolute_uri(form.action, form.page) # the action is resolved against the form's own page
245:     case form.method.upcase
246:     when 'POST'
247:       post_form(uri, form) 
248:     when 'GET'
249:       uri.query = WWW::Mechanize.build_query_string(form.build_query) # overwrites any query string already on the action URI
250:       get(uri)
251:     else
252:       raise "unsupported method: #{form.method.upcase}"
253:     end
254:   end

Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.

[Source]

     # File lib/mechanize.rb, line 276
276:   def transact # Yields self and returns the block's value; the history is put back afterwards.
277:     history_backup = @history.dup # snapshot taken before the block runs
278:     begin
279:       yield self
280:     ensure # runs even when the block raises
281:       @history = history_backup
282:     end
283:   end

Set the user agent for the Mechanize object. See AGENT_ALIASES

[Source]

     # File lib/mechanize.rb, line 158
158:   def user_agent_alias=(al) # Looks al up in AGENT_ALIASES and assigns the matching User-Agent string.
159:     self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias") # RuntimeError when al is not a known key
160:   end

Returns whether or not a url has been visited

[Source]

     # File lib/mechanize.rb, line 262
262:   def visited?(url) # True when visited_page finds a page for url.
263:     ! visited_page(url).nil?
264:   end

Returns a visited page for the url passed in, otherwise nil

[Source]

     # File lib/mechanize.rb, line 267
267:   def visited_page(url) # Looks url up in the history after making it absolute.
268:     if url.respond_to? :href # accept link-like objects as well as plain URLs
269:       url = url.href
270:     end
271:     @history.visited_page(to_absolute_uri(url)) # relative urls are resolved against the current page (to_absolute_uri's default)
272:   end

Protected Instance methods

[Source]

     # File lib/mechanize.rb, line 339
339:   def gen_auth_header(uri, request, auth_header, is_IIS = false)
340:     @@nonce_count += 1
341: 
342:     user = @digest_user
343:     password = @digest_password
344: 
345:     auth_header =~ /^(\w+) (.*)/
346: 
347:     params = {}
348:     $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }
349: 
350:     a_1 = "#{@user}:#{params['realm']}:#{@password}"
351:     a_2 = "#{request.method}:#{uri.path}"
352:     request_digest = ''
353:     request_digest << Digest::MD5.hexdigest(a_1)
354:     request_digest << ':' << params['nonce']
355:     request_digest << ':' << ('%08x' % @@nonce_count)
356:     request_digest << ':' << CNONCE
357:     request_digest << ':' << params['qop']
358:     request_digest << ':' << Digest::MD5.hexdigest(a_2)
359: 
360:     header = ''
361:     header << "Digest username=\"#{@user}\", "
362:     header << "realm=\"#{params['realm']}\", "
363:     if is_IIS then
364:       header << "qop=\"#{params['qop']}\", "
365:     else
366:       header << "qop=#{params['qop']}, "
367:     end
368:     header << "uri=\"#{uri.path}\", "
369:     header << "algorithm=MD5, "
370:     header << "nonce=\"#{params['nonce']}\", "
371:     header << "nc=#{'%08x' % @@nonce_count}, "
372:     header << "cnonce=\"#{CNONCE}\", "
373:     header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
374: 
375:     return header
376:   end

[Source]

     # File lib/mechanize.rb, line 288
288:   def set_headers(uri, request, cur_page)
289:     if @keep_alive
290:       request.add_field('Connection', 'keep-alive')
291:       request.add_field('Keep-Alive', keep_alive_time.to_s)
292:     else
293:       request.add_field('Connection', 'close')
294:     end
295:     request.add_field('Accept-Encoding', 'gzip,identity')
296:     request.add_field('Accept-Language', 'en-us,en;q0.5')
297:     request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
298: 
299:     unless @cookie_jar.empty?(uri)
300:       cookies = @cookie_jar.cookies(uri)
301:       cookie = cookies.length > 0 ? cookies.join("; ") : nil
302:       if log
303:         cookies.each do |c|
304:           log.debug("using cookie: #{c}")
305:         end
306:       end
307:       request.add_field('Cookie', cookie)
308:     end
309: 
310:     # Add Referer header to request
311:     unless cur_page.uri.nil?
312:       request.add_field('Referer', cur_page.uri.to_s)
313:     end
314: 
315:     # Add User-Agent header to request
316:     request.add_field('User-Agent', @user_agent) if @user_agent 
317: 
318:     # Add If-Modified-Since if page is in history
319:     if @conditional_requests
320:       if( (page = visited_page(uri)) && page.response['Last-Modified'] )
321:         request.add_field('If-Modified-Since', page.response['Last-Modified'])
322:       end
323:     end
324: 
325:     if( @auth_hash[uri.host] )
326:       case @auth_hash[uri.host]
327:       when :basic
328:         request.basic_auth(@user, @password)
329:       when :digest
330:         @digest_response ||= nil
331:         @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
332:         request.add_field('Authorization', @digest_response) if @digest_response
333:       end
334:     end
335: 
336:     request
337:   end

Private Instance methods

[Source]

     # File lib/mechanize.rb, line 634
634:   def add_to_history(page)
635:     @history.push(page, to_absolute_uri(page.uri))
636:   end

uri is an absolute URI

[Source]

     # File lib/mechanize.rb, line 444
444:   def fetch_page(uri, request, cur_page=current_page(), request_data=[]) # Sends request to uri and returns the parsed page; follows redirects and retries once per host with auth after a 401.
445:     raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
446: 
447:     log.info("#{ request.class }: #{ request.path }") if log
448: 
449:     page = nil # assigned inside the response block below
450: 
451:     cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= { # one cached connection record per "host:port"
452:       :connection         => nil,
453:       :keep_alive_options => {},
454:     })
455:     http_obj = cache_obj[:connection]
456:     if http_obj.nil? || ! http_obj.started? # no live connection cached: build a fresh one
457:       http_obj = cache_obj[:connection] =
458:           Net::HTTP.new( uri.host,
459:                   uri.port,
460:                   @proxy_addr,
461:                   @proxy_port,
462:                   @proxy_user,
463:                   @proxy_pass
464:                 )
465:       cache_obj[:keep_alive_options] = {}
466: 
467:       # Specify timeouts if given
468:       http_obj.open_timeout = @open_timeout if @open_timeout
469:       http_obj.read_timeout = @read_timeout if @read_timeout
470:     end
471: 
472:     if uri.scheme == 'https' && ! http_obj.started? # SSL options are only set before the connection is started
473:       http_obj.use_ssl = true
474:       http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE # no peer verification unless a CA file is configured
475:       if @ca_file
476:         http_obj.ca_file = @ca_file
477:         http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
478:       end
479:       if @cert && @key # @cert and @key are file paths; @pass is the key's passphrase
480:         http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
481:         http_obj.key  = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
482:       end
483:     end
484: 
485:     # If we're keeping connections alive and the last request time is too
486:     # long ago, stop the connection.  Or, if the max requests left is 1,
487:     # reset the connection.
488:     if @keep_alive && http_obj.started?
489:       opts = cache_obj[:keep_alive_options]
490:       if((opts[:timeout] &&
491:          Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
492:           opts[:max] && opts[:max].to_i == 1)
493: 
494:         log.debug('Finishing stale connection') if log
495:         http_obj.finish # restarted by the http_obj.start just below
496: 
497:       end
498:     end
499: 
500:     http_obj.start unless http_obj.started?
501: 
502:     request = set_headers(uri, request, cur_page)
503: 
504:     # Log specified headers for the request
505:     if log
506:       request.each_header do |k, v|
507:         log.debug("request-header: #{ k } => #{ v }")
508:       end
509:     end
510: 
511:     cache_obj[:last_request_time] = Time.now.to_i # compared against the keep-alive timeout on the next call
512: 
513:     # Send the request
514:     response = http_obj.request(request, *request_data) {|response| # block param has the same name as the outer response variable
515: 
516:       body = StringIO.new # the whole body is buffered in memory
517:       total = 0
518:       response.read_body { |part|
519:         total += part.length
520:         body.write(part)
521:         log.debug("Read #{total} bytes") if log
522:       }
523:       body.rewind
524: 
525:       response.each_header { |k,v|
526:         log.debug("response-header: #{ k } => #{ v }")
527:       } if log
528: 
529:       content_type = nil
530:       unless response['Content-Type'].nil?
531:         data = response['Content-Type'].match(/^([^;]*)/) # media type only; parameters after ";" are dropped
532:         content_type = data[1].downcase unless data.nil?
533:       end
534: 
535:       response_body = 
536:       if encoding = response['Content-Encoding']
537:         case encoding.downcase
538:         when 'gzip'
539:           log.debug('gunzip body') if log
540:           Zlib::GzipReader.new(body).read
541:         when 'x-gzip' # NOTE(review): x-gzip bodies are returned without gunzipping — confirm this is intended
542:           body.read
543:         else
544:           raise 'Unsupported content encoding'
545:         end
546:       else
547:         body.read
548:       end
549: 
550:       # Find our pluggable parser
551:       page = @pluggable_parser.parser(content_type).new(
552:         uri,
553:         response,
554:         response_body,
555:         response.code
556:       ) { |parser|
557:         parser.mech = self if parser.respond_to? :mech=
558:         if parser.respond_to?(:watch_for_set=) && @watch_for_set
559:           parser.watch_for_set = @watch_for_set
560:         end
561:       }
562: 
563:     }
564: 
565:     # If the server sends back keep alive options, save them
566:     if keep_alive_info = response['keep-alive']
567:       keep_alive_info.split(/,\s*/).each do |option|
568:         k, v = option.split(/=/)
569:         cache_obj[:keep_alive_options] ||= {}
570:         cache_obj[:keep_alive_options][k.intern] = v # e.g. the :timeout and :max keys read above
571:       end
572:     end
573: 
574:     (response.get_fields('Set-Cookie')||[]).each do |cookie|
575:       Cookie::parse(uri, cookie, log) { |c|
576:         log.debug("saved cookie: #{c}") if log
577:         @cookie_jar.add(uri, c)
578:       }
579:     end
580: 
581:     log.info("status: #{ page.code }") if log
582: 
583:     res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s] # NOTE(review): nil for codes missing from the table, which would make the <= checks below raise — confirm
584: 
585:     if follow_meta_refresh && page.respond_to?(:meta) && # meta refresh is checked before the status class
586:       (redirect = page.meta.first)
587:       return redirect.click
588:     end
589: 
590:     return page if res_klass <= Net::HTTPSuccess
591: 
592:     if res_klass == Net::HTTPNotModified
593:       log.debug("Got cached page") if log
594:       return visited_page(uri) # 304: return the copy already in history
595:     elsif res_klass <= Net::HTTPRedirection
596:       return page unless follow_redirect?
597:       log.info("follow redirect to: #{ response['Location'] }") if log
598:       from_uri  = page.uri
599:       abs_uri   = to_absolute_uri(response['Location'].to_s, page)
600:       page = fetch_page(abs_uri, fetch_request(abs_uri), page) # the redirect target is always requested with GET (fetch_request default)
601:       @history.push(page, from_uri) # presumably lets the final page be found under the redirecting URI — confirm against History#push
602:       return page
603:     elsif res_klass <= Net::HTTPUnauthorized
604:       raise ResponseCodeError.new(page) unless @user || @password
605:       raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host) # auth was already attempted for this host
606:       if response['www-authenticate'] =~ /Digest/i
607:         @auth_hash[uri.host] = :digest
608:         @digest = response['www-authenticate'] # challenge kept for gen_auth_header
609:       else
610:         @auth_hash[uri.host] = :basic
611:       end
612:       return fetch_page(  uri, # retry the same request; set_headers now adds credentials
613:                           fetch_request(uri, request.method.downcase.to_sym),
614:                           cur_page,
615:                           request_data
616:                        )
617:     end
618: 
619:     raise ResponseCodeError.new(page), "Unhandled response", caller
620:   end

Creates a new request object based on the scheme and type

[Source]

     # File lib/mechanize.rb, line 434
434:   def fetch_request(uri, type = :get) # Builds a Net::HTTP request object for uri's request_uri.
435:     raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
436:     if type == :get
437:       Net::HTTP::Get.new(uri.request_uri)
438:     else # any type other than :get produces a POST request
439:       Net::HTTP::Post.new(uri.request_uri)
440:     end
441:   end

[Source]

     # File lib/mechanize.rb, line 414
414:   def post_form(url, form) # POSTs form's request data to url, adds the resulting page to history and returns it.
415:     cur_page = form.page || current_page ||
416:                     Page.new( nil, {'content-type'=>'text/html'}) # placeholder page when the form has no page and there is no history
417: 
418:     request_data = form.request_data
419: 
420:     abs_url = to_absolute_uri(url, cur_page)
421:     request = fetch_request(abs_url, :post)
422:     request.add_field('Content-Type', form.enctype)
423:     request.add_field('Content-Length', request_data.size.to_s) # assumes request_data.size is the body length in bytes — TODO confirm
424: 
425:     log.debug("query: #{ request_data.inspect }") if log
426: 
427:     # fetch the page
428:     page = fetch_page(abs_url, request, cur_page, [request_data]) # wrapped in an array because fetch_page splats request_data
429:     add_to_history(page) 
430:     page
431:   end

[Source]

     # File lib/mechanize.rb, line 380
380:   def to_absolute_uri(url, cur_page=current_page())
381:     unless url.is_a? URI
382:       url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
383:         sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
384:       }
385: 
386:       url = URI.parse(
387:               Util.html_unescape(
388:                 url.split(/%[0-9A-Fa-f]{2}|#/).zip(
389:                   url.scan(/%[0-9A-Fa-f]{2}|#/)
390:                 ).map { |x,y|
391:                   "#{URI.escape(x)}#{y}"
392:                 }.join('')
393:               )
394:             )
395:     end
396: 
397:     url.path = '/' if url.path.length == 0
398: 
399:     # construct an absolute uri
400:     if url.relative?
401:       raise 'no history. please specify an absolute URL' unless cur_page.uri
402:       base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
403:       url = ((base && base.uri && base.uri.absolute?) ?
404:               base.uri :
405:               cur_page.uri) + url
406:       url = cur_page.uri + url
407:       # Strip initial "/.." bits from the path
408:       url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
409:     end
410: 
411:     return url
412:   end

[Validate]