Last week I posted an article Craigslist Data Scraper (MySQL) detailing a script I wrote in order to find my family a new Ski Boat. After discussing this script with a family friend over the weekend, I was asked if the script could be adapted to use SQLite instead of MySQL. Being the type of person that is always up for a challenge, here is what I came up with. As with the MySQL version, you will need edit the ### Search Parameters ### section with the information you are searching for as well as the ### Email Parameters ### section so you are not sending me your search results. Note, that after running the script the first time, you can safely remove the database creation area annotated by ### Database Creation, remove after first run ###.
#!/usr/bin/env python
'''
clscraper-sqlite - Craigslist Scraper w/SQLite
Copyright (c) 2014 Matthew Marable
Email: mmarable@oitibs.com
Web: https://oitibs.com
clscraper-sqlite is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
clscraper-sqlite is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with clscraper-sqlite. If not, see <http://www.gnu.org/licenses/>.
'''
import sys
import time
import urllib
import datetime
import feedparser
import sqlite3 as db
from mailer import Mailer
from mailer import Message
### Database Initialization ###
con = None
con = db.connect('clscraper.db')
cur = con.cursor()
### Database Creation, remove after first run ###
try:
cur.execute("""
CREATE TABLE IF NOT EXISTS LISTINGS (
`SAID` INTEGER PRIMARY KEY,
`URL` TEXT NOT NULL,
`TITLE` TEXT NOT NULL,
`MODIFIED` TEXT NOT NULL,
`NEW` INTEGER NOT NULL DEFAULT 0)"""
)
except db.error, e:
print "Error %s:" % e.args[0]
sys.exit(1)
### Database Creation, remove after first run ###
### Search Parameters ###
listings = ["boo"]
sites = ["smd","annapolis","baltimore","washingtondc",
"frederick","fredericksburg","westmd","delaware",
"easternshore","martinsburg"]
terms = ["malibu","moomba","nautique","supra","mastercraft",
"centurion","american skier","tige","brendella","axis",
"ski supreme"]
### Craigslist RSS Search URL ###
rss_generic_link = "http://%s.craigslist.org/search/%s?query=%s"
### Generate the RSS links ###
rss_links = []
modified = datetime.datetime.now()
html_text = ""
count = 0
for listing in listings:
for site in sites:
for term in terms:
term = urllib.quote(term)
rss_link = rss_generic_link % (site, listing, term)
rss_link = rss_link + "&format=rss"
rss_links.append(rss_link)
for rss_link in rss_links:
sections = feedparser.parse(rss_link)
for section in sections.entries:
title = section["title"]
url = section["link"]
cur.execute("""
SELECT MODIFIED
FROM LISTINGS
WHERE TITLE = ?""",
(title,)
)
if cur.fetchone() == None:
count = count + 1
html_text = html_text + "<a href='" + url + "' >" + title + "</a><br><br>"
cur.execute("""
INSERT INTO LISTINGS
(URL, TITLE, MODIFIED, NEW)
VALUES (?, ?, ?, ?)""",
(url, title, modified, 1,)
)
con.commit()
con.close()
if count > 0:
### Email Parameters ###
message = Message(From="Search Automation <tech@oitibs.com>",To="mmarable@oitibs.com")
message.Subject = "New Listings (" + str(count) + ")"
message.Html = html_text
sender = Mailer('[SMTP Server IP]')
sender.send(message)
Once you have edited the script to your liking, you can schedule the script to run at intervals using your crontab.
Craigslist Data Scraper (SQLite) Download
Hi! Just want to say thank you for this very informative article about Craigslist.