import scraperwiki
import urllib2, urllib, urlparse
import re
import datetime
import httplib
import time
import mechanize
import lxml.html

# this should efficiently scrape the most recent wells in the database
# and make backups when changed  (could be done with triggers, but that's too advanced)
url = "https://www.og.decc.gov.uk/pls/wons/wdep0100.qryWell"
nowtime = datetime.datetime.now().isoformat()

def Main():
    ScrapeWellLinks()
    ScrapeNewPages()
    RescrapeRecentPages()

# fetch one well page; save a new row when its contents are new or have changed
def ScrapeWellPage(welllink):
    contents = urllib.urlopen(welllink["wellurl"]).read()
    if re.search("403 = Request forbidden", contents):
        print "Bad request", contents
        return False
    if "contents" in welllink and welllink["contents"] == contents:
        #print "rescrape no change", welllink
        return False
    print "newwellpage"
    welldata = parsewell(contents)
    if not welldata:
        return False
    welldata.update(welllink)  # merges in all those parsed fields (both tables share the same initial columns)
    welldata["contents"] = contents
    welldata["scrapedate"] = nowtime
    scraperwiki.sqlite.save(["quadrant", "block", "wellno", "scrapedate"], welldata, table_name="wellpages", verbose=0)
    return True

# do just the new ones: well links with no corresponding row in wellpages yet
def ScrapeNewPages():
    qlist = "welllinks.quadrant, welllinks.block, welllinks.wellno, welllinks.wellurl"
    qjoin = " left join wellpages on wellpages.quadrant=welllinks.quadrant and wellpages.block=welllinks.block and wellpages.wellno=welllinks.wellno"
    while True:
        try:
            welllinks = scraperwiki.sqlite.select("%s from welllinks %s where wellpages.contents is null limit 30" % (qlist, qjoin))
        except scraperwiki.sqlite.NoSuchTableSqliteError:
            welllinks = scraperwiki.sqlite.select("%s from welllinks limit 10" % qlist)
        if not welllinks:
            break
        print "Scraping new pages", len(welllinks)
        for welllink in welllinks:
            ScrapeWellPage(welllink)

# I can't work out the way to filter by max scrapedate across the whole row,
# so filtering by comparing with the previous row and skipping
def RescrapeRecentPages():
    qlist = "quadrant, block, wellno, wellurl, contents, scrapedate"
    wellpages = scraperwiki.sqlite.select("%s from wellpages order by `Spud Date` desc, quadrant, block, wellno, scrapedate desc limit 200" % qlist)
    prevwellpage = None
    for wellpage in wellpages:
        if prevwellpage and ((prevwellpage["quadrant"], prevwellpage["block"], prevwellpage["wellno"]) ==
                             (wellpage["quadrant"], wellpage["block"], wellpage["wellno"])):
            print "skipping", wellpage["quadrant"], wellpage["block"], wellpage["wellno"], wellpage["scrapedate"]
            continue
        prevwellpage = wellpage
        if wellpage["scrapedate"] == nowtime:
            continue  # already scraped on this run
        if ScrapeWellPage(wellpage):
            print "New data", wellpage

# step through the quadrant dropdown on the DECC search form and save a link for every well
def ScrapeWellLinks():
    br = mechanize.Browser()
    br.set_handle_robots(False)  # robots.txt doesn't exist, don't waste time looking for it
    br.open(url)
    i = 0
    while True:
        br.form = list(br.forms())[0]
        selectcontrol = br.form.find_control(name='f_quadNoList', type='select')
        if i >= len(selectcontrol.items):
            return None
        item = selectcontrol.items[i]
        quadrant = item.get_labels()[0].text
        #print i, "quadrant", quadrant
        selectcontrol.value = [quadrant]
        response = br.submit()
        root = lxml.html.parse(response).getroot()
        pageurl = br.geturl()

        # make a table of all the links to individual well pages, which we can sort by priority
        welltable = root.cssselect("center table")[1]
        rows = welltable.cssselect("tr")
        assert [x.text for x in rows[0].cssselect("td em")] == ['Quadrant/Block no', 'Well Numbers'], lxml.html.tostring(rows[0])
        ldata = []
        for row in rows[1:]:
            tdqb, cont = row.cssselect("td")
            tdquadrant, tdblock = tdqb.text.split("/")
            assert tdquadrant == quadrant
            for a in cont.cssselect("a"):
                wellno = a.text.strip()
                wellurl = urlparse.urljoin(pageurl, a.attrib.get("href"))
                ldata.append({"quadrant": quadrant, "block": tdblock, "wellno": wellno, "wellurl": wellurl})
        #print "Number of wells", len(ldata)
        scraperwiki.sqlite.save(["quadrant", "block", "wellno"], ldata, table_name="welllinks")
        i += 1

# expected field names on a complete well page, in sorted order
wellkeys = ['Block No', 'Block Suffix', 'Bottom Hole Latitude', 'Bottom Hole Longtitude', 'Completion Date',
            'Completion Status', 'Coordinate System', 'Country Code', 'Date TD Reached', 'Datum Elevation',
            'Datum Type', 'Deviated Well', 'Drilling Sequence No', 'Geodetic Datum', 'Ground Elevation',
            'Onshore Offshore', 'Original Intent', 'Platform', 'Primary Target', 'Quadrant No', 'Slot No',
            'Spud Date', 'TVDSS Driller', 'Top Hole Longtitude', 'Top hole Latitude', 'Total MD Driller',
            'Total MD Logger', 'Water Depth', 'Well Registration No', 'Wellbore Type']

feetkeys = ['Datum Elevation', 'Ground Elevation', 'Total MD Driller', 'Total MD Logger', 'Water Depth']  # reported in feet and/or metres
datekeys = ['Completion Date', 'Date TD Reached', 'Spud Date']  # dd-MON-yyyy dates
monthnames = ["JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"]

# convert 'ddd mm ss.sssN' / 'ddd mm ss.sss[EW]' strings to decimal degrees
def latlongconv(hnamelat, hnamelong, data, coordsystem):
    hlat = data.pop(hnamelat, "")
    hlng = data.pop(hnamelong, "")
    if not hlat or not hlng:
        return None
    if hlat == '000 00 00.000' and hlng == '000 00 00.000':
        return None
    mlat = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)N", hlat)
    mlng = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)([EW])", hlng)
    if not mlat or not mlng:
        return None
    # should convert the datum
    lat = int(mlat.group(1)) + int(mlat.group(2)) / 60.0 + float(mlat.group(3)) / 3600
    lng = int(mlng.group(1)) + int(mlng.group(2)) / 60.0 + float(mlng.group(3)) / 3600
    if mlng.group(4) == "W":
        lng = -lng
    # should convert from ED50 here
    return lat, lng

def parsewell(html):
    # extract the values and check they are complete
    data = dict(re.findall(r'<A HREF="/pls/wons/wdep0100.basicWellDataEpic.*">(.*?)\.?</A>\s*=\s*(.*?)\s*<BR>', html))
    if not data:
        return None

    # collapse the '(feet)'/'(metres)' variants into a single value in metres
    for fkey in feetkeys:
        fkeyfeet = data.pop(fkey + ' (feet)', None)
        fkeymetres = data.pop(fkey + ' (metres)', None)
        if fkeyfeet == 'NULL Value' or fkeymetres == 'NULL Value':
            data[fkey] = 'NULL Value'
        elif fkeymetres:
            data[fkey] = float(fkeymetres)
        elif fkeyfeet:
            data[fkey] = float(fkeyfeet) * 0.3048
    data['Onshore Offshore'] = data.pop('Onshore/Offshore')

    skeys = sorted(data.keys())
    if skeys != wellkeys:
        print (skeys, wellkeys, html), "\n\ntruncated?"
        return None

    # remove NULL Values and convert dates (e.g. 22-APR-1985) to ISO format
    for key in data.keys():
        if data[key] == 'NULL Value' or data[key] == '':
            del data[key]
        elif key in datekeys:
            sdate = data[key]
            data[key] = datetime.date(int(sdate[7:]), monthnames.index(sdate[3:6]) + 1, int(sdate[:2])).isoformat()

    # get coordinates
    coordsystem = data.pop('Coordinate System')
    assert coordsystem == data.pop('Geodetic Datum')
    assert coordsystem == 'ED50'
    latlngbottom = latlongconv('Bottom Hole Latitude', 'Bottom Hole Longtitude', data, coordsystem)
    if latlngbottom:
        data['bottomhole_lat'], data['bottomhole_lng'] = latlngbottom
    latlngtop = latlongconv('Top hole Latitude', 'Top Hole Longtitude', data, coordsystem)
    if latlngtop:
        data['tophole_lat'], data['tophole_lng'] = latlngtop

    assert data.get('Completion Status', None) in ['Completed', 'Suspended', 'Abandoned', None]
    if data.get("Datum Type") == "Kelly Bushing":
        data["Datum Type"] = "KB"
    return data

Main()
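For reference, here is the arithmetic latlongconv performs, as a standalone sketch. The input strings are made-up examples in the 'ddd mm ss.sssN' format its regexes expect, not real well coordinates:

import re

hlat = "058 24 30.500N"   # hypothetical example values
hlng = "001 05 15.250W"

mlat = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)N", hlat)
mlng = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)([EW])", hlng)

# decimal degrees = degrees + minutes/60 + seconds/3600; west longitudes are negated
lat = int(mlat.group(1)) + int(mlat.group(2)) / 60.0 + float(mlat.group(3)) / 3600
lng = int(mlng.group(1)) + int(mlng.group(2)) / 60.0 + float(mlng.group(3)) / 3600
if mlng.group(4) == "W":
    lng = -lng

print lat, lng  # 58.4084722... -1.0875694...

As the comments in latlongconv note, the result is still in the ED50 datum; converting to another datum is left undone.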
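And a minimal sketch of inspecting the two tables the scraper writes (welllinks and wellpages, per the table_name arguments above), assuming it runs in the same environment with the same scraperwiki.sqlite API the scraper itself uses:

import scraperwiki

# count the saved link rows
nlinks = scraperwiki.sqlite.select("count(*) as n from welllinks")[0]["n"]
print "well links:", nlinks

# look at the most recently scraped well pages
for row in scraperwiki.sqlite.select(
        "quadrant, block, wellno, scrapedate from wellpages order by scrapedate desc limit 5"):
    print row["quadrant"], row["block"], row["wellno"], row["scrapedate"]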