Capybara, poltergeistでTimed out waiting for response toエラーが出る。
http://ja.aliexpress.com/wholesale?catId=0&initiative_id=AS_20151205231131&SearchText=bag+mens
を起点に、商品ページのURLリストを取得し、その数を出力します。
たまに、期待した出力である6が出力されるのですが、
下記のようなエラーが出てしまう時があります。エラー8割、正常2割くらいの割合です。
何が原因かがわからず、途方に暮れています。
何かヒントをいただけませんか?
require 'open-uri'
require 'nokogiri'
require 'pstore'
require 'anemone'
require 'open_uri_redirections'
require 'openssl'
require 'selenium-webdriver'
require 'capybara'
require 'capybara/poltergeist'
#require 'phantomjs'
class Crawl
def get_html_obj(url)
Capybara.run_server = false
Capybara.register_driver :poltergeist do |app|
Capybara::Poltergeist::Driver.new(app, {js_errors: false})
end
session = Capybara::Session.new(:poltergeist)
session.visit(url)
return Nokogiri::HTML.parse(session.html)
end
def get_page_list(url)
url_list = Array.new
doc = get_html_obj(url)
doc.css('div[class = "ui-pagination-navi util-left"]>a').each do |page|
url_list << page.attribute("href").text
end
url_list = url_list.sort_by do |k|
/(.+)(page=)([\d]+)/ =~ k
$3.to_i
end
return url_list.uniq
end
def get_page_list_all(url,n)
url_list_all = Array.new
crawl_url = Crawl.new
last_url = url
n.times do |n|
url_list = crawl_url.get_page_list(last_url)
last_url = url_list.last
url_list_all.concat(url_list)
end
url_list_all = url_list_all.sort_by do |k|
/(.+)(page=)([\d]+)/ =~ k
$3.to_i
end
return url_list_all.uniq
end
def get_product_list(url)
url_list = Array.new
doc = get_html_obj(url)
doc.css('a[class = "history-item product "]').each do |page|
url_list << page.attribute("href").text
end
return url_list
end
def get_product_list_all(url,n)
url_list = Array.new
get_page_list_all(url,n).each do |url_tmp|
doc = get_html_obj(url_tmp)
doc.css('a[class = "history-item product "]').each do |page|
url_list << page.attribute("href").text
end
end
return url_list
end
def get_product_obj(url)
obj = Crawl.new
return obj.get_html_obj(url)
end
def get_product_objs(url)
obj = Crawl.new
products = Array.new
urls = obj.get_product_list(url)
urls.each do |url|
products << obj.get_html_obj(url)
end
return products
end
def get_product_objs_all(url,n)
obj = Crawl.new
products = Array.new
urls = obj.get_page_list_all(url,n)
urls.each do |url|
products << obj.get_html_obj(url)
end
return products
end
end
init_url = "http://ja.aliexpress.com/wholesale?catId=0&initiative_id=AS_20151205231131&SearchText=bag+mens"
crawl = Crawl.new
puts crawl.get_page_list(init_url).count
C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/web_socket_server.rb:98:in `rescue in send': Timed out waiting for response to {"id":"8482ca9f-6a3d-40d2-b6b7-eb33c8327b41","name":"visit","args":["http://ja.aliexpress.com/wholesale?catId=0&initiative_id=AS_20151205231131&SearchText=bag+mens"]}. It's possible that this happened because something took a very long time (for example a page load was slow). If so, setting the Poltergeist :timeout option to a higher value will help (see the docs for details). If increasing the timeout does not help, this is probably a bug in Poltergeist - please report it to the issue tracker. (Capybara::Poltergeist::TimeoutError)
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/web_socket_server.rb:94:in `send'
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/server.rb:33:in `send'
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/browser.rb:340:in `command'
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/browser.rb:34:in `visit'
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/poltergeist-1.8.1/lib/capybara/poltergeist/driver.rb:95:in `visit'
from C:/Ruby21-x64/lib/ruby/gems/2.1.0/gems/capybara-2.5.0/lib/capybara/session.rb:232:in `visit'
from C:/Users/Akinori/Documents/NetBeansProjects/RubyApplication1/lib/main2.rb:20:in `get_html_obj'
from C:/Users/Akinori/Documents/NetBeansProjects/RubyApplication1/lib/main2.rb:26:in `get_page_list'
from C:/Users/Akinori/Documents/NetBeansProjects/RubyApplication1/lib/main2.rb:109:in `<main>'