pycurl is an extremely powerful python module for use in performing HTTP transactions. I’ve spent a great deal of time with it in the past few months. I’ve got a good handle on the most common ways that I’ve been using it. While pycurl is not terribly complicated, I really appreciate simple interfaces, and decided to put a wrapper around pycurl that simplified the interface, yet still supported the bulk of my needs.
What I came up with allows me to process HTTP transactions with 2-3 lines of code. In the spirit of the ‘Easy’ interface to libcurl, I called this the ‘Very Easy’ pycurl interface. I am releasing this under the MIT (or more accurately X11) license.
Save the code below in a file called ‘VEpycurl.py’ and run with it. The code is actually quite short; the three examples that are included consume about 2/3 of the file.
#!/opt/local/bin/python
# Change the interpreter path above to the correct location for your system
"""
Copyright (C) 2009, Kenneth East
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
"""
#
# module for VEpycurl - the Very Easy interface to pycurl
#
import os
import sys
import tempfile
import urllib
from StringIO import StringIO

import pycurl
class VEpycurl():
    """
    A VERY EASY interface to pycurl, v1.0

    Tested on 22Feb09 with python 2.5.1, py25-curl 7.19.0, libcurl/7.19.2, OS-X 10.5.6

    Each instance wraps one pycurl.Curl handle.  The handle is configured
    in __init__() and closed at the end of perform(), so an instance
    serves exactly one request; create a new VEpycurl for every transfer.

    Typical use:
        ve = VEpycurl()
        ve.perform(url)        # GET; pass fields=... to POST instead
        page = ve.results()    # file-like object rewound to offset 0
    """

    def __init__(self,
                 userAgent='Mozilla/4.0 (compatible; MSIE 8.0)',
                 followLocation=1,   # follow redirects?
                 autoReferer=1,      # allow 'referer' to be set normally?
                 verifySSL=0,        # tell SSL to verify IDs?
                 useCookies=True,    # will hold all pycurl cookies
                 useSOCKS=False,     # use SOCKS5 proxy?
                 proxy='localhost',  # SOCKS host
                 proxyPort=8080,     # SOCKS port
                 proxyType=5,        # SOCKS protocol
                 verbose=False,
                 debug=False,
                 ):
        """
        Create and configure the underlying curl handle.

        userAgent      -- value sent as the User-Agent header
        followLocation -- non-zero to follow HTTP redirects (at most 20)
        autoReferer    -- non-zero to let curl set Referer when redirected
        verifySSL      -- true to verify the peer certificate and host name
        useCookies     -- true to enable cookies, backed by a private temp file
        useSOCKS       -- true to route the request through the proxy below
        proxy          -- proxy host name (only used when useSOCKS is true)
        proxyPort      -- proxy port      (only used when useSOCKS is true)
        proxyType      -- value for pycurl.PROXYTYPE (only used when useSOCKS
                          is true)
        verbose        -- true to turn on curl's VERBOSE option
        debug          -- true to print pycurl version info and install
                          self.debug as curl's debug callback
        """
        self.followLocation = followLocation
        self.autoReferer = autoReferer
        self.verifySSL = verifySSL
        self.useCookies = useCookies
        self.useSOCKS = useSOCKS
        self.proxy = proxy
        self.proxyPort = proxyPort
        self.proxyType = proxyType

        # Buffer handed out by results().  Starting with an empty one means
        # results() is safe to call even before a successful perform().
        self.pc = StringIO()
        # Path of the cookie file, or None when cookies are disabled.
        self.cookieFile = None

        self.pco = pycurl.Curl()
        self.pco.setopt(pycurl.USERAGENT, userAgent)
        self.pco.setopt(pycurl.FOLLOWLOCATION, followLocation)
        self.pco.setopt(pycurl.MAXREDIRS, 20)
        self.pco.setopt(pycurl.CONNECTTIMEOUT, 30)
        self.pco.setopt(pycurl.AUTOREFERER, autoReferer)

        # SSL verification.  VERIFYPEER is a 0/1 switch, but libcurl's "on"
        # value for VERIFYHOST is 2; passing the raw flag would send 1, which
        # does not request a host-name check.
        self.pco.setopt(pycurl.SSL_VERIFYPEER, 1 if verifySSL else 0)
        self.pco.setopt(pycurl.SSL_VERIFYHOST, 2 if verifySSL else 0)

        if useCookies:
            # mkstemp() creates the file itself, so nobody else can claim the
            # name between choosing it and using it (the weakness of
            # os.tempnam()).  curl only needs the path, so the descriptor is
            # closed straight away.  The file is left for the caller to
            # delete; its path is kept in self.cookieFile.
            fd, cjf = tempfile.mkstemp(prefix='VEpycurl-', suffix='.cookies')
            os.close(fd)
            self.cookieFile = cjf
            self.pco.setopt(pycurl.COOKIEFILE, cjf)
            self.pco.setopt(pycurl.COOKIEJAR, cjf)

        if useSOCKS:
            # if you wish to use SOCKS, it is configured through these parms
            self.pco.setopt(pycurl.PROXY, proxy)
            self.pco.setopt(pycurl.PROXYPORT, proxyPort)
            self.pco.setopt(pycurl.PROXYTYPE, proxyType)

        if verbose:
            self.pco.setopt(pycurl.VERBOSE, 1)

        if debug:
            print('PyCurl version info:')
            print(pycurl.version_info())
            print('')
            # NOTE(review): per the libcurl docs the debug callback is only
            # invoked while VERBOSE is on -- confirm whether debug=True
            # should imply verbose=True.
            self.pco.setopt(pycurl.DEBUGFUNCTION, self.debug)

    def perform(self, url, fields=None, headers=None):
        """
        Run one HTTP transfer and keep the response body for results().

        url     -- the URL to fetch
        fields  -- POST fields (anything urllib.urlencode() accepts, e.g. a
                   dict); when empty or None the request is a GET
        headers -- optional list of 'Name: value' strings sent as extra
                   request headers

        The curl handle is closed when this method returns or raises, so it
        can only be called once per instance.  Exceptions raised by pycurl
        propagate to the caller; in that case the buffer returned by
        results() is left unchanged.
        """
        pageContents = StringIO()
        try:
            if fields:
                # This is a POST and we have fields to handle
                self.pco.setopt(pycurl.POST, 1)
                self.pco.setopt(pycurl.POSTFIELDS, urllib.urlencode(fields))
            self.pco.setopt(pycurl.WRITEFUNCTION, pageContents.write)
            self.pco.setopt(pycurl.URL, url)
            if headers:
                self.pco.setopt(pycurl.HTTPHEADER, headers)
            self.pco.perform()
        finally:
            # Always release the handle, even when the transfer fails.
            self.pco.close()
        self.pc = pageContents

    def results(self):
        """
        Return the body received by the most recent perform().

        The return value is the StringIO buffer itself, rewound to offset 0
        so it can be read from the start on every call.
        """
        self.pc.seek(0)
        return self.pc

    def debug(self, debug_type, debug_msg):
        """Debug callback installed for curl: print the message type and text."""
        print('debug(%d): %s' % (debug_type, debug_msg))
# Initialise libcurl once for the whole process when this module is loaded.
try:
    # only call this once in a process. see libcurl docs for more info.
    pycurl.global_init(pycurl.GLOBAL_ALL)
except Exception:
    # "except Exception" rather than a bare "except" so that SystemExit and
    # KeyboardInterrupt are not mistaken for an initialisation failure.
    # sys.exc_info() is used instead of "except ... as e" to stay compatible
    # with the python 2.5 this module was tested on.
    sys.stderr.write('Fatal error: call to pycurl.global_init() failed for some reason\n')
    sys.stderr.write('Reason: %s\n' % (sys.exc_info()[1],))
    sys.exit(1)
if __name__ == '__main__':

    def demoGoogle(name, passwd):
        """
        VEpycurl demo for google/picasaweb

        Logon using google account.  Use google account authentication
        token to access picasaweb.  Get the XML description of
        the current account status.

        My original idea was to quickly create a google example
        to demo VEpycurl.  I was somewhat surprised to see that it
        was more involved than just hacking a couple of URLs.
        Fortunately, using curl with google is very well documented at:
        http://code.google.com/apis/gdata/articles/using_cURL.html

        name   -- google account name (sent as the 'Email' post field)
        passwd -- password for that account
        """
        import re
        import xml.dom.minidom

        def gat(pc):
            # Get the Auth Token from a GOOGLE login response.
            # 'pc' is the list of response lines; the token is the value on
            # the line that starts with 'Auth='.  Returns None if absent.
            auth = re.compile(r"Auth=(?P<AUTH>[A-Za-z0-9_\-]+)")
            for token in pc:
                match = auth.match(token)
                if match:
                    return match.group('AUTH')
            return None

        ## Login to the google account. This will provide us with
        ## an authentication token that can be used to access
        ## any of google's services.

        # instantiate a pycurl object for the HTTP operation
        ve = VEpycurl()

        # setup the URL and post fields required to login
        url = 'https://www.google.com/accounts/ClientLogin'

        # pf contains the post fields and value. Note that python
        # does not specify the ordering of dictionary entries when they
        # are accessed iteratively. This means that the order in which
        # the post fields will appear in the outgoing POST is undefined.
        # For many sites, like google, this is not a problem. However,
        # some sites may expect a specific ordering. In that case, I suggest
        # the use of an ordered dictionary. I use the one by Armin Ronacher.
        # It can literally be taken directly from PEP:372, which is located at
        # http://www.python.org/dev/peps/pep-0372/. I did not use it here
        # for simplicity and so as to obviate the need for another download.
        pf = {'Email': name,
              'Passwd': passwd,
              'accountType': 'GOOGLE',
              'source': 'VEpycurl demo',
              'service': 'lh2',
              }

        # perform the POST operation using the above arguments
        # Note: if 'fields' is not 'None', 'fields' is expected
        # to be a dictionary of post fields and values.
        ve.perform(url, fields=pf)

        # The post, if successful, returned an auth token used
        # to access google services. Grab the authtoken from
        # the page that was returned and keep it for later use.
        authtoken = gat(ve.results().readlines())
        if not authtoken:
            print("Error: was not able to login to google.")
            return
        print("Successful google account login")

        ## Access the picasaweb account associated with this google account.
        ## Once access is granted, dump all of the specifics about the account.
        ## This very closely models the google example referred to in the docstring.

        # A VEpycurl instance is good for one request only, so make a new one.
        ve = VEpycurl()
        url = 'http://picasaweb.google.com/data/feed/api/user/default'

        # Add the auth token to the default headers that will be sent
        hdrs = ['Authorization: GoogleLogin auth=' + authtoken]

        # Since 'fields' == None, this will perform a GET
        ve.perform(url, fields=None, headers=hdrs)
        body = ve.results().getvalue()
        if re.compile('No such user.').search(body):
            print('Unable to access picasaweb account.')
            print('Does one exist for this google account?')
            return
        dom = xml.dom.minidom.parseString(body)
        print(dom.toprettyxml())

    def demoDrudge(accessSecretly=False, proxy=None, proxyPort=None, proxyType=None):
        """
        Print the URLs of the 'news' items currently shown on the Drudge Report

        accessSecretly -- true to send the request through a SOCKS proxy
        proxy, proxyPort, proxyType -- proxy settings handed to VEpycurl;
            any left as None fall back to VEpycurl's own defaults
        """
        def printItems(pc):
            # 'pc' is the file-like page contents returned by results().
            import re
            # NOTE(review): this "start of content" marker is empty in the
            # published source -- it looks like the original pattern was lost
            # when the article was rendered.  An empty pattern matches the
            # very first line, so links are reported from line two onward.
            tcre = re.compile('')
            itre = re.compile(r'<A HREF="(?P<URL>[^"]+)"')
            tc = False
            for line in pc.readlines():
                if not tc:
                    # still looking for the marker line
                    if tcre.search(line):
                        tc = True
                else:
                    match = itre.search(line)
                    if match:
                        print(match.group('URL'))

        # Forward only the proxy settings the caller actually supplied.
        options = {'useSOCKS': accessSecretly}
        if proxy is not None:
            options['proxy'] = proxy
        if proxyPort is not None:
            options['proxyPort'] = proxyPort
        if proxyType is not None:
            options['proxyType'] = proxyType

        ve = VEpycurl(**options)
        ve.perform('http://drudgereport.com')
        printItems(ve.results())

    # DEMO 1
    # Print the list of URLs for 'news' articles currently on drudgereport.com
    demoDrudge()

    # DEMO 2
    # Print the list of URLs for 'news' articles currently on drudgereport.com
    # Use a previously configured SOCKS proxy so nobody will discover us.
    # Uncomment & modify the next line if you are SOCKS enabled; skip it otherwise.
    # demoDrudge(accessSecretly=True, proxy='localhost', proxyPort=8080, proxyType=5)

    # DEMO 3
    # Login to a GOOGLE account. Use the access token provided at login to
    # print the current status of the associated picasaweb account. Pass the
    # userid of the GOOGLE account you wish to access as the first command
    # line argument; the password is prompted for so that it never appears
    # in the source or on the command line.
    if len(sys.argv) > 1:
        import getpass
        demoGoogle(sys.argv[1], getpass.getpass('Password for %s: ' % sys.argv[1]))
    else:
        print('Skipping the google demo: pass a google account name as the first argument to run it.')