The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of visited URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'
# Create an agent whose logger writes to mech.log and which identifies
# itself with the 'Mac Safari' user agent alias.
agent = WWW::Mechanize.new { |mech| mech.log = Logger.new("mech.log") }
agent.user_agent_alias = 'Mac Safari'

# Fetch the Google home page and fill in the "q" field of the form named "f".
home_page = agent.get("http://www.google.com/")
query_form = home_page.form_with(:name => "f")
query_field = query_form.field_with(:name => "q")
query_field.value = "Hello"

# Submit the form and print the body of the page that comes back.
results_page = agent.submit(query_form)
puts results_page.body
| VERSION | = | '0.9.3' | The version of Mechanize you are using. | |
| AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } | User Agent aliases |
| redirect_ok | -> | follow_redirect? |
| ca_file | [RW] | |
| cert | [RW] | |
| conditional_requests | [RW] | |
| cookie_jar | [RW] | |
| follow_meta_refresh | [RW] | |
| history | [R] | |
| history_added | [RW] | |
| html_parser | [RW] | The HTML parser to be used when parsing documents |
| html_parser | [RW] | |
| keep_alive | [RW] | |
| keep_alive_time | [RW] | |
| key | [RW] | |
| log | [RW] | |
| open_timeout | [RW] | |
| pass | [RW] | |
| pluggable_parser | [R] | |
| read_timeout | [RW] | |
| redirect_ok | [RW] | |
| redirection_limit | [RW] | |
| request_headers | [RW] | A hash of custom request headers |
| scheme_handlers | [RW] | |
| user_agent | [RW] | |
| verify_callback | [RW] | |
| watch_for_set | [RW] |
# File lib/www/mechanize.rb, line 100
# Builds an agent with its default settings and then yields the agent to the
# optional block so the caller can adjust it.
def initialize
  # State exposed through attr_accessors
  @cookie_jar      = CookieJar.new
  @log             = nil
  @open_timeout    = nil
  @read_timeout    = nil
  @user_agent      = AGENT_ALIASES['Mechanize']
  @watch_for_set   = nil
  @history_added   = nil

  # OpenSSL settings
  @ca_file         = nil  # server certificate file
  # Callback for OpenSSL errors raised while verifying the server
  # certificate chain; can be used for debugging, or to ignore errors by
  # always returning _true_.
  @verify_callback = nil
  @cert            = nil  # certificate
  @key             = nil  # private key
  @pass            = nil  # password

  @redirect_ok     = true # should redirects be followed?

  # State exposed through attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication
  @user            = nil  # auth user
  @password        = nil  # auth password
  @digest          = nil  # DigestAuth digest
  @auth_hash       = {}   # keeps track of urls for sending auth
  @request_headers = {}   # request headers to be used

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true
  @follow_meta_refresh  = false
  @redirection_limit    = 20

  # Connection cache and keep-alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # A scheme that has no handler gets one, on first lookup, that raises
  # UnsupportedSchemeError for that scheme.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end

  # The known schemes all share one handler that returns the link untouched.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  @html_parser = self.class.html_parser

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/www/mechanize.rb, line 196
# Stores the user name and password to be used for authentication.
#
# Returns +password+.
def auth(user, password)
  @user, @password = user, password
  password
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 292
# Clicks the link passed in and returns the page fetched.
#
# +link+ is either an object that responds to +href+ (and optionally
# +page+), or a node-like object that can be indexed with 'href' / 'src'.
# The page the link belongs to is sent as the referer when the link provides
# one; otherwise the current page is used.
def click(link)
  # Ask the link whether it knows its page rather than rescuing every
  # StandardError, which would also hide genuine failures raised by #page.
  referer = link.respond_to?(:page) ? link.page : nil
  href = link.respond_to?(:href) ? link.href : (link['href'] || link['src'])
  get(:url => href, :referer => (referer || current_page()))
end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 261
# Sends a DELETE request to +url+ with +query_params+ and returns the page,
# which is also added to the history.
#
# The request goes through #head with :verb forced to :delete; any other
# entries in +options+ are passed along unchanged.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  response_page = head(url, query_params, delete_options)
  add_to_history(response_page)
  response_page
end
Fetches the URL passed in and returns a page.
# File lib/www/mechanize.rb, line 203
# Fetches a URL and returns the page. The page is added to the history and
# yielded to the block when one is given.
#
# Two calling conventions are accepted:
#
#   get(url, parameters = [], referer = nil)
#   get(:url => url, :params => [...], :referer => ..., :headers => {...})
#
# In the positional form, a second argument that does not respond to +each+
# is taken to be the referer.
#
# Raises ArgumentError when an options hash is given without a :url entry.
def get(options, parameters = [], referer = nil)
  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
  else
    url = options
    # FIXME: Remove this in 0.8.0
    unless parameters.respond_to?(:each)
      referer, parameters = parameters, []
    end
  end

  # Without a referer, an http(s) URL gets a blank page as referer;
  # anything else falls back to the current page when there is one.
  unless referer
    referer = Page.new(nil, {'content-type'=>'text/html'}) if url.to_s =~ /^http/
    referer ||= current_page || Page.new(nil, {'content-type'=>'text/html'})
  end

  # FIXME: Huge hack so that using a URI as a referer works. I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :params  => parameters
  )
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/www/mechanize.rb, line 286
# Fetches +url+ with #get and returns the body of the resulting page.
def get_file(url)
  fetched = get(url)
  fetched.body
end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 272
# Sends a HEAD request to +url+ with +query_params+ and returns the page,
# yielding it to the block first when one is given.
#
# Entries in +options+ override the defaults (:uri, :headers, :params and
# :verb) that are handed to fetch_page.
def head(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :head
  }
  # fetch the page
  page = fetch_page(defaults.merge(options))
  yield page if block_given?
  page
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 311
# Posts +query+ to +url+ and returns the result of post_form.
#
# +query+ is iterated as name/value pairs, so it may be a hash or an array
# of two-element arrays. A value that is an IO becomes a file upload and
# switches the form to multipart/form-data; every other value becomes a
# plain field.
def post(url, query={})
  # Create a fake form: a hash carrying the form attributes, with a
  # +search+ method that always returns an empty list.
  node = {}
  def node.search(*args); []; end
  node['method']  = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each do |name, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new(name.to_s, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new(name.to_s, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end
  post_form(url, form)
end
# File lib/www/mechanize.rb, line 174
# Returns the hooks held by the post-connect hook object
# (@post_connect_hook).
def post_connect_hooks
  hook_holder = @post_connect_hook
  hook_holder.hooks
end
# File lib/www/mechanize.rb, line 170
# Returns the hooks held by the pre-connect hook object
# (@pre_connect_hook).
def pre_connect_hooks
  hook_holder = @pre_connect_hook
  hook_holder.hooks
end
PUT to url with query_params, and setting options:
put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 250
# Sends a PUT request to +url+ with +query_params+ and returns the page,
# which is also added to the history.
#
# The request goes through #head with :verb forced to :put; any other
# entries in +options+ are passed along unchanged.
def put(url, query_params = {}, options = {})
  put_options = options.merge(:verb => :put)
  response_page = head(url, query_params, put_options)
  add_to_history(response_page)
  response_page
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://" prefix.
# File lib/www/mechanize.rb, line 180
# Stores the proxy address, port, user and password.
#
# Returns the four values as an array, in that order.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 340
# Submits +form+ and returns the resulting page.
#
# When +button+ is given it is added to the form's query first. A POST form
# is sent through post_form; a GET form is fetched with #get, using the
# form's action with any trailing query string removed and the form's page
# as referer.
#
# Raises a RuntimeError for any other form method.
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  return post_form(form.action, form, headers) if http_method == 'POST'

  unless http_method == 'GET'
    raise "unsupported method: #{form.method.upcase}"
  end

  target  = form.action.gsub(/\?[^\?]*$/, '')
  query   = form.build_query
  get(:url     => target,
      :params  => query,
      :headers => headers,
      :referer => form.page)
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 376
# Yields the agent to the block and returns the block's value. Afterwards
# the history is replaced by a copy taken before the block ran, even if the
# block raised.
def transact
  saved_history = @history.dup
  begin
    yield self
  ensure
    # Runs on normal exit and on exceptions alike.
    @history = saved_history
  end
end
Returns whether or not a url has been visited
# File lib/www/mechanize.rb, line 362
# Returns true when visited_page finds an entry for +url+, false when it
# returns nil.
def visited?(url)
  earlier_visit = visited_page(url)
  !earlier_visit.nil?
end
# File lib/www/mechanize.rb, line 584
# Pushes +page+ onto the history under its resolved URI, then calls the
# history_added callback with the page if a callback is set.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)
  history_added.call(page) if history_added
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 419
# Performs the request described by +params+ and returns the resulting page.
#
# +params+ is merged over the defaults below: :referer defaults to the
# current page, :verb to :get, :params to [], :headers to {} and :redirects
# to 0.
#
# The options pass through the "before connect" chain, the request is sent
# (retried after EOFError, Errno::ECONNRESET and Errno::EPIPE, re-raising on
# the third failure), and the options then pass through the "after connect"
# chain. Depending on the response class the method returns the page,
# follows a refresh or redirect, retries with authentication, or raises.
#
# Raises RedirectLimitReachedError when following a Refresh header or an
# HTTP redirect would exceed +redirection_limit+, and ResponseCodeError for
# unauthorized responses that are not retried and for unhandled responses.
419: def fetch_page(params)
420: options = {
421: :request => nil,
422: :response => nil,
423: :connection => nil,
424: :referer => current_page(),
425: :uri => nil,
426: :verb => :get,
427: :agent => self,
428: :redirects => 0,
429: :params => [],
430: :headers => {},
431: }.merge(params)
432:
# Each chain link reads from and writes to the shared +options+ hash; the
# values it produces (:uri, :request, :connection, ...) are read back below.
433: before_connect = Chain.new([
434: Chain::URIResolver.new(@scheme_handlers),
435: Chain::ParameterResolver.new,
436: Chain::RequestResolver.new,
437: Chain::ConnectionResolver.new(
438: @connection_cache,
439: @keep_alive,
440: @proxy_addr,
441: @proxy_port,
442: @proxy_user,
443: @proxy_pass
444: ),
445: Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
446: Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
447: Chain::HeaderResolver.new(
448: @keep_alive,
449: @keep_alive_time,
450: @cookie_jar,
451: @user_agent,
452: {}
453: ),
454: Chain::CustomHeaders.new,
455: @pre_connect_hook,
456: ])
457: before_connect.handle(options)
458:
459: uri = options[:uri]
460: request = options[:request]
461: cur_page = options[:referer]
462: request_data = options[:params]
463: redirects = options[:redirects]
464: http_obj = options[:connection]
465:
466: # Add If-Modified-Since if page is in history
467: if( (page = visited_page(uri)) && page.response['Last-Modified'] )
468: request['If-Modified-Since'] = page.response['Last-Modified']
469: end if(@conditional_requests)
470:
471: # Specify timeouts if given
472: http_obj.open_timeout = @open_timeout if @open_timeout
473: http_obj.read_timeout = @read_timeout if @read_timeout
474: http_obj.start unless http_obj.started?
475:
476: # Log specified headers for the request
477: log.info("#{ request.class }: #{ request.path }") if log
478: request.each_header do |k, v|
479: log.debug("request-header: #{ k } => #{ v }")
480: end if log
481:
482: # Send the request
# On a dropped connection the connection is finished and restarted and the
# request is retried; the error is re-raised once two retries have been used.
483: attempts = 0
484: begin
485: response = http_obj.request(request, *request_data) { |r|
486: connection_chain = Chain.new([
487: Chain::ResponseReader.new(r),
488: Chain::BodyDecodingHandler.new,
489: ])
490: connection_chain.handle(options)
491: }
492: rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
493: log.error("Rescuing EOF error") if log
494: http_obj.finish
495: raise x if attempts >= 2
496: request.body = nil
497: http_obj.start
498: attempts += 1
499: retry
500: end
501:
502: after_connect = Chain.new([
503: @post_connect_hook,
504: Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
505: Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
506: ])
507: after_connect.handle(options)
508:
509: res_klass = options[:res_klass]
510: response_body = options[:response_body]
511: page = options[:page]
512:
513: log.info("status: #{ page.code }") if log
514:
# Refresh handling: a <meta> entry on the page takes precedence over a
# 'refresh' response header.
# NOTE(review): only the header branch checks redirection_limit; the <meta>
# branch follows the refresh without that check - confirm this is intended.
515: if follow_meta_refresh
516: redirect_uri = nil
517: referer = page
518: if (page.respond_to?(:meta) && (redirect = page.meta.first))
519: redirect_uri = redirect.uri.to_s
520: sleep redirect.node['delay'].to_f
521: referer = Page.new(nil, {'content-type'=>'text/html'})
522: elsif refresh = response['refresh']
523: delay, redirect_uri = Page::Meta.parse(refresh, uri)
524: raise StandardError, "Invalid refresh http header" unless delay
525: if redirects + 1 > redirection_limit
526: raise RedirectLimitReachedError.new(page, redirects)
527: end
528: sleep delay.to_f
529: end
530: if redirect_uri
# Record the refreshing page in the history before fetching its target.
531: @history.push(page, page.uri)
532: return fetch_page(
533: :uri => redirect_uri,
534: :referer => referer,
535: :params => [],
536: :verb => :get,
537: :redirects => redirects + 1
538: )
539: end
540: end
541:
# Successful responses are returned as they are.
542: return page if res_klass <= Net::HTTPSuccess
543:
544: if res_klass == Net::HTTPNotModified
545: log.debug("Got cached page") if log
# Prefer the copy already in the history; fall back to the new page.
546: return visited_page(uri) || page
547: elsif res_klass <= Net::HTTPRedirection
548: return page unless follow_redirect?
549: log.info("follow redirect to: #{ response['Location'] }") if log
550: from_uri = page.uri
551: raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
# A HEAD request stays a HEAD request; every other verb is followed with GET.
552: redirect_verb = options[:verb] == :head ? :head : :get
553: page = fetch_page( :uri => response['Location'].to_s,
554: :referer => page,
555: :params => [],
556: :verb => redirect_verb,
557: :redirects => redirects + 1
558: )
# The redirect target is stored in the history under the redirecting URI.
559: @history.push(page, from_uri)
560: return page
561: elsif res_klass <= Net::HTTPUnauthorized
# Give up when no credentials are set, or when this host already has an
# entry in @auth_hash.
562: raise ResponseCodeError.new(page) unless @user || @password
563: raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
564: if response['www-authenticate'] =~ /Digest/i
565: @auth_hash[uri.host] = :digest
566: if response['server'] =~ /Microsoft-IIS/
567: @auth_hash[uri.host] = :iis_digest
568: end
569: @digest = response['www-authenticate']
570: else
571: @auth_hash[uri.host] = :basic
572: end
# Repeat the same request now that the scheme for this host is recorded.
573: return fetch_page( :uri => uri,
574: :referer => cur_page,
575: :verb => request.method.downcase.to_sym,
576: :params => request_data,
577: :headers => options[:headers]
578: )
579: end
580:
# Reached only when none of the branches above returned.
581: raise ResponseCodeError.new(page), "Unhandled response", caller
582: end
# File lib/www/mechanize.rb, line 397
# Sends +form+ to +url+ as a POST request and returns the resulting page,
# which is also added to the history.
#
# url     - the URI to post to
# form    - an object providing +page+, +request_data+ and +enctype+
# headers - extra request headers; on conflict they override the generated
#           Content-Type and Content-Length
#
# The referer is the form's page, else the current page, else a blank page.
def post_form(url, form, headers = {})
  cur_page = form.page || current_page ||
    Page.new(nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length is a byte count. String#size counts characters on Ruby
  # 1.9+, so use bytesize where it exists and fall back to size on Rubies
  # that do not have it.
  content_length = request_data.respond_to?(:bytesize) ?
    request_data.bytesize : request_data.size

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => cur_page,
    :verb    => :post,
    :params  => [request_data],
    :headers => {
      'Content-Type'   => form.enctype,
      'Content-Length' => content_length.to_s,
    }.merge(headers)
  )
  add_to_history(page)
  page
end