# Crawler-wide settings.
# The child keys are indented under `bot` so they form its mapping; written
# flush with `bot:` they would be top-level keys and `bot` itself would be null.
bot:
  name: "My-simple-bot"
  delay: 5  # presumably seconds between requests — TODO confirm
  spiders: 1
  empty_queue_retries: 100
  request_timeout: 300  # presumably seconds — TODO confirm
  old_data_expire: 604800  # 7 days
# Per-site scraping rules, keyed by domain.
# NOTE(review): the nesting below is reconstructed from key names because the
# original indentation was lost; confirm the tree against the code that reads
# this file (in particular whether `lists`, `options`, `categories`, `htmls`
# and `images` belong directly under the domain or under `attributes`).
entry:
  example.com:
    refetch: 86400  # A day
    base: 'http://example.com'
    # URLs the crawl starts from.
    start_urls:
      - 'http://example.com/start/a'
      - 'http://example.com/start/b'
    # URLs that must not be followed.
    deny_url:
      - 'http://example.com/deny/x'
    # XPath expressions selecting the anchors whose links are followed.
    link_containers:
      - '//div[contains(@class, "categories_links")]/a'
      - '//ul[contains(@class, "pagination")]/li/a'
      - '//div[@id="content"]//div[contains(@class, "product-thumb")]//h4/a'
    # NOTE(review): presumably XPaths that must match for a page to count as a
    # product page — verify; `brand` is assumed to belong here, not elsewhere.
    exist_attribute:
      id: "//input[@name='product_id']/@value"
      brand: '//div[@id="product_information_2"]//div[contains(@class, "Brand-Logo")]/img/@alt'
    # Scalar fields, each extracted with one XPath expression.
    attributes:
      name: "//h1[contains(@class, 'product-title')]/span/text()"
      code: "//div[contains(@class, 'product-model')]/text()"
      price1: "//li[@itemprop='price']/text()"
      price2: "//h2[@itemprop='price']/text()"
      price3: ""
      price_old: "//div[@id='product_information_2']//li[contains(@class, 'price-old')]/text()"
      available: '//div[contains(@class, "stock-li")]//text()'
    # Key/value tables: `row` selects each row, `key` and `value` are relative
    # XPaths (they start with `.`) evaluated inside a row.
    lists:
      - name: 'tags'
        key: './/td[1]/text()'
        value: './/td[2]/text()'
        row: "//div[@id='tab-specification']/table/tbody/tr"
    options:
      - type: 'select'
        selector: "//div[@id='product']//select"
    # Category extraction is currently disabled: both child keys are commented
    # out, so this key parses as null.
    categories:
      # selector: '//ul[contains(@class, "breadcrumb")]/li'
      # value: ".//a/span/text()"
    htmls:
      - name: "description"
        selector: "//div[@id='tab-description']/*"
    images:
      main: "//div[@id='myElement']//img[@id='main']/@data-src"
      additional: "//ul[contains(@class, 'slides')]//img/@data-for-target"
# File names used for the scraper's JSON input/output.
# The keys are indented under `IO` so they form its mapping rather than being
# top-level keys next to a null `IO`.
IO:
  complete_domains: "completed.json"
  incomplete_domains: "incomplete.json"
  directory_domains: "directories.json"
  feed_data: "feed.json"
# Logging settings.
# `name` and `format` are indented under `logger`; left at the top level,
# `name` would be a duplicate of any other top-level `name` key.
logger:
  name: "scraper.log"
  # NOTE(review): looks like a Python `logging` %-style format string — confirm.
  format: "%(asctime)s %(levelname)-8s [%(thread)d](%(module)s)(%(lineno)-3d) %(message)s"