The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
# Example: fetch a page, fill in its search form, submit it, and print the
# body of the resulting page.
require 'rubygems'
require 'mechanize'
require 'logger'

# The block passed to new receives the agent being built; it is used here to
# attach a Logger that writes to "mech.log".
agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
# 'Mac Safari' is one of the keys listed in AGENT_ALIASES.
agent.user_agent_alias = 'Mac Safari'
page = agent.get("http://www.google.com/")
# Locate the form named "f" and set its field named "q".
search_form = page.form_with(:name => "f")
search_form.field_with(:name => "q").value = "Hello"
# Submit the form and print the body of the page that comes back.
search_results = agent.submit(search_form)
puts search_results.body
VERSION | = | '0.9.3' | The version of Mechanize you are using. | |
AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" | User Agent aliases |
redirect_ok | -> | follow_redirect? |
ca_file | [RW] | |
cert | [RW] | |
conditional_requests | [RW] | |
cookie_jar | [RW] | |
follow_meta_refresh | [RW] | |
history | [R] | |
history_added | [RW] | |
html_parser | [RW] | The HTML parser to be used when parsing documents |
html_parser | [RW] | |
keep_alive | [RW] | |
keep_alive_time | [RW] | |
key | [RW] | |
log | [RW] | |
open_timeout | [RW] | |
pass | [RW] | |
pluggable_parser | [R] | |
read_timeout | [RW] | |
redirect_ok | [RW] | |
redirection_limit | [RW] | |
request_headers | [RW] | A hash of custom request headers |
scheme_handlers | [RW] | |
user_agent | [RW] | |
verify_callback | [RW] | |
watch_for_set | [RW] |
# File lib/www/mechanize.rb, line 100
# Builds an agent with default settings and then yields the agent to the
# optional block so the caller can adjust it.
def initialize
  # --- state exposed through attr_accessors ---
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil

  # OpenSSL server certificate file
  @ca_file = nil

  # Callback for OpenSSL errors raised while verifying the server
  # certificate chain; can be used for debugging, or to ignore errors by
  # always returning _true_.
  @verify_callback = nil

  @cert = nil         # OpenSSL Certificate
  @key  = nil         # OpenSSL Private Key
  @pass = nil         # OpenSSL Password

  @redirect_ok = true # Should we follow redirects?

  # --- state exposed through attr_readers ---
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # --- authentication ---
  @user            = nil # Auth User
  @password        = nil # Auth Password
  @digest          = nil # DigestAuth Digest
  @auth_hash       = {}  # Keep track of urls for sending auth
  @request_headers = {}  # A hash of request headers to be used

  # --- proxy settings ---
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # --- connection cache & keep alive ---
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Looking up an unregistered scheme stores (and returns) a handler that
  # raises UnsupportedSchemeError when it is called.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end

  # The supported schemes all share one handler that returns the link as is.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  # The parser defaults to the class-level html_parser setting.
  @html_parser = self.class.html_parser

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/www/mechanize.rb, line 196
# Stores the credentials to use for authentication.
#
# user::     value stored in @user
# password:: value stored in @password
#
# Returns +password+ (the value of the last assignment).
def auth(user, password)
  @user, @password = user, password
  password
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 292
# Follows +link+ and returns the result of #get for its target.
#
# link:: an object answering +href+, or one that can be indexed with
#        'href' / 'src'.
#
# The referer sent is the link's own page when it can be obtained, otherwise
# the current page.
def click(link)
  # Any StandardError raised while asking the link for its page simply means
  # "no referer known".
  source_page = begin
    link.page
  rescue StandardError
    nil
  end

  target = if link.respond_to?(:href)
    link.href
  else
    link['href'] || link['src']
  end

  get(:url => target, :referer => (source_page || current_page()))
end
Sends a DELETE request to url with query_params, applying the given options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 261
# Issues a DELETE request by delegating to #head with :verb forced to
# :delete, records the resulting page in the history and returns it.
#
# url::          target of the request
# query_params:: passed through to #head
# options::      passed through to #head; any :verb entry is overridden
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  response_page  = head(url, query_params, delete_options)
  add_to_history(response_page)
  response_page
end
Fetches the URL passed in and returns a page.
# File lib/www/mechanize.rb, line 203
# Fetches a URL, adds the resulting page to the history, yields it to the
# optional block and returns it.
#
# Two calling conventions are accepted:
#
#   get(url, parameters = [], referer = nil)
#   get(:url => url, :params => [...], :referer => ..., :headers => {...})
#
# Raises ArgumentError when the hash form is used without a :url entry.
def get(options, parameters = [], referer = nil)
  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError.new("url must be specified") unless url
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
  else
    url = options
    # A second argument that cannot be iterated is treated as the referer.
    unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
      referer    = parameters
      parameters = []
    end
  end

  # Without an explicit referer: URLs matching /^http/ get a blank page,
  # anything else uses the current page (or a blank page if there is none).
  referer ||= (url.to_s =~ /^http/ ? nil : current_page) ||
              Page.new(nil, {'content-type'=>'text/html'})

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer     = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  fetched = fetch_page(
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :params  => parameters
  )
  add_to_history(fetched)
  yield fetched if block_given?
  fetched
end
Fetch a file and return the contents of the file.
# File lib/www/mechanize.rb, line 286
# Fetches +url+ through #get and returns the +body+ of the result.
def get_file(url)
  fetched = get(url)
  fetched.body
end
Sends a HEAD request to url with query_params, applying the given options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 272
# Issues a HEAD request via #fetch_page, yields the resulting page to the
# optional block and returns it. The page is not added to the history here.
#
# url::          stored under :uri
# query_params:: stored under :params
# options::      merged over the defaults, so entries such as :headers or
#                :verb given here take precedence
def head(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :head
  }

  # fetch the page
  fetched = fetch_page(defaults.merge(options))
  yield fetched if block_given?
  fetched
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 311
# Builds a form from +query+ and submits it to +url+ through #post_form.
#
# url::   target of the POST
# query:: name/value pairs (a Hash or an array of two-element arrays). A
#         value that is an IO becomes a file upload and switches the form's
#         enctype to 'multipart/form-data'; any other value becomes a field.
def post(url, query={})
  # Create a fake form node: a Hash carrying the form attributes, plus a
  # +search+ singleton method that always finds nothing.
  fake_node = {
    'method'  => 'POST',
    'enctype' => 'application/x-www-form-urlencoded'
  }
  def fake_node.search(*args)
    []
  end

  form = Form.new(fake_node)
  query.each do |name, value|
    unless value.is_a?(IO)
      form.fields << Form::Field.new(name.to_s, value)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new(name.to_s, ::File.basename(value.path))
    upload.file_data = value.read
    form.file_uploads << upload
  end
  post_form(url, form)
end
# File lib/www/mechanize.rb, line 174
# Returns the +hooks+ collection held by the post-connect hook chain
# (@post_connect_hook).
def post_connect_hooks
  hook_chain = @post_connect_hook
  hook_chain.hooks
end
# File lib/www/mechanize.rb, line 170
# Returns the +hooks+ collection held by the pre-connect hook chain
# (@pre_connect_hook).
def pre_connect_hooks
  hook_chain = @pre_connect_hook
  hook_chain.hooks
end
Sends a PUT request to url with query_params, applying the given options:
put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 250
# Issues a PUT request by delegating to #head with :verb forced to :put,
# records the resulting page in the history and returns it.
#
# url::          target of the request
# query_params:: passed through to #head
# options::      passed through to #head; any :verb entry is overridden
def put(url, query_params = {}, options = {})
  put_options   = options.merge(:verb => :put)
  response_page = head(url, query_params, put_options)
  add_to_history(response_page)
  response_page
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://" prefix.
# File lib/www/mechanize.rb, line 180
# Records the proxy settings on the agent.
#
# addr:: stored in @proxy_addr
# port:: stored in @proxy_port
# user:: stored in @proxy_user (defaults to nil)
# pass:: stored in @proxy_pass (defaults to nil)
#
# Returns the four values as an array, in the order given above.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com') agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 340
# Submits +form+ using the HTTP method the form declares.
#
# form::    the form to submit
# button::  optional button; when given it is added to the form's query
# headers:: extra request headers, passed on to #post_form or #get
#
# A POST form goes through #post_form. A GET form goes through #get with
# the query string stripped from the form's action and the form's own query
# passed as parameters. Any other method raises a RuntimeError.
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  if http_method == 'POST'
    post_form(form.action, form, headers)
  elsif http_method == 'GET'
    # Drop everything from the last '?' onward; the query is rebuilt from
    # the form's fields.
    action_without_query = form.action.gsub(/\?[^\?]*$/, '')
    get(
      :url     => action_without_query,
      :params  => form.build_query,
      :headers => headers,
      :referer => form.page
    )
  else
    raise "unsupported method: #{http_method}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 376
# Yields the agent to the block and afterwards puts the history back to a
# copy taken before the block ran, whether or not the block raised.
#
# Returns the value of the block.
def transact
  # Snapshot first; the ensure below only guards the block itself.
  saved_history = @history.dup
  begin
    yield self
  ensure
    @history = saved_history
  end
end
Returns whether or not a url has been visited
# File lib/www/mechanize.rb, line 362
# Returns true when #visited_page finds something for +url+ (anything other
# than nil), false otherwise.
def visited?(url)
  found = visited_page(url)
  !found.nil?
end
# File lib/www/mechanize.rb, line 584
# Pushes +page+ onto the history together with the resolved form of its URI,
# then invokes the +history_added+ callback with the page when one is set.
#
# Returns the callback's result, or nil when there is no callback.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)

  callback = history_added
  callback.call(page) if callback
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 419
# Performs one HTTP request and returns the resulting page, following meta
# refreshes, redirects and authentication challenges by calling itself.
#
# params:: Hash merged over these defaults: :request, :response,
#          :connection (nil), :referer (the current page), :uri (nil),
#          :verb (:get), :agent (self), :redirects (0), :params ([]),
#          :headers ({}).
#
# Returns the fetched page; for a 304 response, the previously visited page
# for the URI when there is one.
#
# Raises RedirectLimitReachedError when following a refresh or redirect
# would exceed +redirection_limit+, ResponseCodeError for an unauthorized
# response that cannot be retried and for unhandled response classes, and
# StandardError for an unparsable Refresh header.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  # The handlers below fill in entries of +options+ (the resolved :uri,
  # :request and :connection are read back from it right after).
  before_connect = Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(
      @keep_alive,
      @keep_alive_time,
      @cookie_jar,
      @user_agent,
      {}
    ),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ])
  before_connect.handle(options)

  uri           = options[:uri]
  request       = options[:request]
  cur_page      = options[:referer]
  request_data  = options[:params]
  redirects     = options[:redirects]
  http_obj      = options[:connection]

  # Add If-Modified-Since if page is in history
  if( (page = visited_page(uri)) && page.response['Last-Modified'] )
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  # Specify timeouts if given
  http_obj.open_timeout = @open_timeout if @open_timeout
  http_obj.read_timeout = @read_timeout if @read_timeout
  http_obj.start unless http_obj.started?

  # Log specified headers for the request
  log.info("#{ request.class }: #{ request.path }") if log
  request.each_header do |k, v|
    log.debug("request-header: #{ k } => #{ v }")
  end if log

  # Send the request. Dropped connections are retried on a restarted
  # connection; the error is re-raised once two retries have been used.
  attempts = 0
  begin
    response = http_obj.request(request, *request_data) { |r|
      connection_chain = Chain.new([
        Chain::ResponseReader.new(r),
        Chain::BodyDecodingHandler.new,
      ])
      connection_chain.handle(options)
    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
    log.error("Rescuing EOF error") if log
    http_obj.finish
    raise x if attempts >= 2
    # NOTE(review): the request body is cleared before retrying, presumably
    # because request_data is passed to http_obj.request again; confirm.
    request.body = nil
    http_obj.start
    attempts += 1
    retry
  end

  # The handlers below populate :res_klass, :response_body and :page.
  after_connect = Chain.new([
    @post_connect_hook,
    Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
    Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
  ])
  after_connect.handle(options)

  res_klass      = options[:res_klass]
  response_body  = options[:response_body]
  page           = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri  = nil
    referer       = page
    if (page.respond_to?(:meta) && (redirect = page.meta.first))
      # Enforce the redirection limit for <meta> refreshes as well, so that a
      # page refreshing to itself cannot recurse without bound.
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      redirect_uri = redirect.uri.to_s
      sleep redirect.node['delay'].to_f
      referer = Page.new(nil, {'content-type'=>'text/html'})
    elsif refresh = response['refresh']
      delay, redirect_uri = Page::Meta.parse(refresh, uri)
      raise StandardError, "Invalid refresh http header" unless delay
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep delay.to_f
    end
    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri        => redirect_uri,
        :referer    => referer,
        :params     => [],
        :verb       => :get,
        :redirects  => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri  = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb is followed with GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(  :uri => response['Location'].to_s,
                        :referer => page,
                        :params  => [],
                        :verb => redirect_verb,
                        :redirects => redirects + 1
                     )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    # Give up when no credentials are configured, or when this host has
    # already been through an authentication attempt.
    raise ResponseCodeError.new(page) unless @user || @password
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    # Repeat the same request now that the scheme for this host is recorded.
    return fetch_page(  :uri      => uri,
                        :referer  => cur_page,
                        :verb     => request.method.downcase.to_sym,
                        :params   => request_data,
                        :headers  => options[:headers]
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
# File lib/www/mechanize.rb, line 397
# POSTs +form+ to +url+ through #fetch_page, adds the resulting page to the
# history and returns it.
#
# url::     target of the POST
# form::    supplies +page+ (used as referer), +request_data+ (the body)
#           and +enctype+ (the Content-Type)
# headers:: extra request headers; these override the generated
#           Content-Type and Content-Length entries
#
# The referer is the form's page, falling back to the current page and then
# to a blank page.
def post_form(url, form, headers = {})
  cur_page = form.page || current_page ||
                  Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length is a byte count. String#size counts characters on Ruby
  # 1.9+, which is too small for multibyte bodies, so prefer bytesize and
  # fall back to size where bytesize does not exist.
  content_length = if request_data.respond_to?(:bytesize)
    request_data.bytesize
  else
    request_data.size
  end

  # fetch the page
  page = fetch_page(  :uri      => url,
                      :referer  => cur_page,
                      :verb     => :post,
                      :params   => [request_data],
                      :headers  => {
                        'Content-Type'    => form.enctype,
                        'Content-Length'  => content_length.to_s,
                      }.merge(headers))
  add_to_history(page)
  page
end