import scraperwiki
import urllib2, urllib, urlparse
import re
import datetime
import httplib
import time
import mechanize
import lxml.html

# this should efficiently scrape the most recent wells in the database
# and make backups when changed  (could be done with triggers, but that's too advanced)
url = "https://www.og.decc.gov.uk/pls/wons/wdep0100.qryWell"
nowtime = datetime.datetime.now().isoformat()

def Main():
    ScrapeWellLinks()
    ScrapeNewPages()
    RescrapeRecentPages()

# fetch one well page; save a new row when its contents are new or have changed
def ScrapeWellPage(welllink):
    contents = urllib.urlopen(welllink["wellurl"]).read()
    if re.search("403 = Request forbidden", contents):
        print "Bad request", contents
        return False
    if "contents" in welllink and welllink["contents"] == contents:
        #print "rescrape no change", welllink
        return False
    print "newwellpage"
    welldata = parsewell(contents)
    if not welldata:
        return False
    welldata.update(welllink)  # merges in all those parsed fields (both tables share the same initial columns)
    welldata["contents"] = contents
    welldata["scrapedate"] = nowtime
    scraperwiki.sqlite.save(["quadrant", "block", "wellno", "scrapedate"], welldata, table_name="wellpages", verbose=0)
    return True

# do just the new ones: well links with no corresponding row in wellpages yet
def ScrapeNewPages():
    qlist = "welllinks.quadrant, welllinks.block, welllinks.wellno, welllinks.wellurl"
    qjoin = " left join wellpages on wellpages.quadrant=welllinks.quadrant and wellpages.block=welllinks.block and wellpages.wellno=welllinks.wellno"
    while True:
        try:
            welllinks = scraperwiki.sqlite.select("%s from welllinks %s where wellpages.contents is null limit 30" % (qlist, qjoin))
        except scraperwiki.sqlite.NoSuchTableSqliteError:
            welllinks = scraperwiki.sqlite.select("%s from welllinks limit 10" % qlist)
        if not welllinks:
            break
        print "Scraping new pages", len(welllinks)
        for welllink in welllinks:
            ScrapeWellPage(welllink)

# I can't work out the way to filter by max scrapedate across the whole row,
# so filtering by comparing with the previous row and skipping
def RescrapeRecentPages():
    qlist = "quadrant, block, wellno, wellurl, contents, scrapedate"
    wellpages = scraperwiki.sqlite.select("%s from wellpages order by `Spud Date` desc, quadrant, block, wellno, scrapedate desc limit 200" % qlist)
    prevwellpage = None
    for wellpage in wellpages:
        if prevwellpage and ((prevwellpage["quadrant"], prevwellpage["block"], prevwellpage["wellno"]) ==
                             (wellpage["quadrant"], wellpage["block"], wellpage["wellno"])):
            print "skipping", wellpage["quadrant"], wellpage["block"], wellpage["wellno"], wellpage["scrapedate"]
            continue
        prevwellpage = wellpage
        if wellpage["scrapedate"] == nowtime:
            continue  # already scraped on this run
        if ScrapeWellPage(wellpage):
            print "New data", wellpage

# step through the quadrant dropdown on the DECC search form and save a link for every well
def ScrapeWellLinks():
    br = mechanize.Browser()
    br.set_handle_robots(False)  # robots.txt doesn't exist, don't waste time looking for it
    br.open(url)
    i = 0
    while True:
        br.form = list(br.forms())[0]
        selectcontrol = br.form.find_control(name='f_quadNoList', type='select')
        if i >= len(selectcontrol.items):
            return None
        item = selectcontrol.items[i]
        quadrant = item.get_labels()[0].text
        #print i, "quadrant", quadrant
        selectcontrol.value = [quadrant]
        response = br.submit()
        root = lxml.html.parse(response).getroot()
        pageurl = br.geturl()

        # make a table of all the links to individual well pages, which we can sort by priority
        welltable = root.cssselect("center table")[1]
        rows = welltable.cssselect("tr")
        assert [x.text for x in rows[0].cssselect("td em")] == ['Quadrant/Block no', 'Well Numbers'], lxml.html.tostring(rows[0])
        ldata = []
        for row in rows[1:]:
            tdqb, cont = row.cssselect("td")
            tdquadrant, tdblock = tdqb.text.split("/")
            assert tdquadrant == quadrant
            for a in cont.cssselect("a"):
                wellno = a.text.strip()
                wellurl = urlparse.urljoin(pageurl, a.attrib.get("href"))
                ldata.append({"quadrant": quadrant, "block": tdblock, "wellno": wellno, "wellurl": wellurl})
        #print "Number of wells", len(ldata)
        scraperwiki.sqlite.save(["quadrant", "block", "wellno"], ldata, table_name="welllinks")
        i += 1

# expected field names on a complete well page, in sorted order
wellkeys = ['Block No', 'Block Suffix', 'Bottom Hole Latitude', 'Bottom Hole Longtitude', 'Completion Date',
            'Completion Status', 'Coordinate System', 'Country Code', 'Date TD Reached', 'Datum Elevation',
            'Datum Type', 'Deviated Well', 'Drilling Sequence No', 'Geodetic Datum', 'Ground Elevation',
            'Onshore Offshore', 'Original Intent', 'Platform', 'Primary Target', 'Quadrant No', 'Slot No',
            'Spud Date', 'TVDSS Driller', 'Top Hole Longtitude', 'Top hole Latitude', 'Total MD Driller',
            'Total MD Logger', 'Water Depth', 'Well Registration No', 'Wellbore Type']

feetkeys = ['Datum Elevation', 'Ground Elevation', 'Total MD Driller', 'Total MD Logger', 'Water Depth']  # reported in feet and/or metres
datekeys = ['Completion Date', 'Date TD Reached', 'Spud Date']  # dd-MON-yyyy dates
monthnames = ["JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"]

# convert 'ddd mm ss.sssN' / 'ddd mm ss.sss[EW]' strings to decimal degrees
def latlongconv(hnamelat, hnamelong, data, coordsystem):
    hlat = data.pop(hnamelat, "")
    hlng = data.pop(hnamelong, "")
    if not hlat or not hlng:
        return None
    if hlat == '000 00 00.000' and hlng == '000 00 00.000':
        return None
    mlat = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)N", hlat)
    mlng = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)([EW])", hlng)
    if not mlat or not mlng:
        return None
    # should convert the datum
    lat = int(mlat.group(1)) + int(mlat.group(2)) / 60.0 + float(mlat.group(3)) / 3600
    lng = int(mlng.group(1)) + int(mlng.group(2)) / 60.0 + float(mlng.group(3)) / 3600
    if mlng.group(4) == "W":
        lng = -lng
    # should convert from ED50 here
    return lat, lng

def parsewell(html):
    # extract the values and check they are complete
    data = dict(re.findall(r'<A HREF="/pls/wons/wdep0100.basicWellDataEpic.*">(.*?)\.?</A>\s*=\s*(.*?)\s*<BR>', html))
    if not data:
        return None

    # collapse the '(feet)'/'(metres)' variants into a single value in metres
    for fkey in feetkeys:
        fkeyfeet = data.pop(fkey + ' (feet)', None)
        fkeymetres = data.pop(fkey + ' (metres)', None)
        if fkeyfeet == 'NULL Value' or fkeymetres == 'NULL Value':
            data[fkey] = 'NULL Value'
        elif fkeymetres:
            data[fkey] = float(fkeymetres)
        elif fkeyfeet:
            data[fkey] = float(fkeyfeet) * 0.3048
    data['Onshore Offshore'] = data.pop('Onshore/Offshore')

    skeys = sorted(data.keys())
    if skeys != wellkeys:
        print (skeys, wellkeys, html), "\n\ntruncated?"
        return None

    # remove NULL Values and convert dates (e.g. 22-APR-1985) to ISO format
    for key in data.keys():
        if data[key] == 'NULL Value' or data[key] == '':
            del data[key]
        elif key in datekeys:
            sdate = data[key]
            data[key] = datetime.date(int(sdate[7:]), monthnames.index(sdate[3:6]) + 1, int(sdate[:2])).isoformat()

    # get coordinates
    coordsystem = data.pop('Coordinate System')
    assert coordsystem == data.pop('Geodetic Datum')
    assert coordsystem == 'ED50'
    latlngbottom = latlongconv('Bottom Hole Latitude', 'Bottom Hole Longtitude', data, coordsystem)
    if latlngbottom:
        data['bottomhole_lat'], data['bottomhole_lng'] = latlngbottom
    latlngtop = latlongconv('Top hole Latitude', 'Top Hole Longtitude', data, coordsystem)
    if latlngtop:
        data['tophole_lat'], data['tophole_lng'] = latlngtop

    assert data.get('Completion Status', None) in ['Completed', 'Suspended', 'Abandoned', None]
    if data.get("Datum Type") == "Kelly Bushing":
        data["Datum Type"] = "KB"
    return data

Main()
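For reference, here is the arithmetic latlongconv performs, as a standalone sketch. The input strings are made-up examples in the 'ddd mm ss.sssN' format its regexes expect, not real well coordinates:

import re

hlat = "058 24 30.500N"   # hypothetical example values
hlng = "001 05 15.250W"

mlat = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)N", hlat)
mlng = re.match(r"(\d\d\d) (\d\d) (\d\d\.\d\d\d)([EW])", hlng)

# decimal degrees = degrees + minutes/60 + seconds/3600; west longitudes are negated
lat = int(mlat.group(1)) + int(mlat.group(2)) / 60.0 + float(mlat.group(3)) / 3600
lng = int(mlng.group(1)) + int(mlng.group(2)) / 60.0 + float(mlng.group(3)) / 3600
if mlng.group(4) == "W":
    lng = -lng

print lat, lng  # 58.4084722... -1.0875694...

As the comments in latlongconv note, the result is still in the ED50 datum; converting to another datum is left undone.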
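And a minimal sketch of inspecting the two tables the scraper writes (welllinks and wellpages, per the table_name arguments above), assuming it runs in the same environment with the same scraperwiki.sqlite API the scraper itself uses:

import scraperwiki

# count the saved link rows
nlinks = scraperwiki.sqlite.select("count(*) as n from welllinks")[0]["n"]
print "well links:", nlinks

# look at the most recently scraped well pages
for row in scraperwiki.sqlite.select(
        "quadrant, block, wellno, scrapedate from wellpages order by scrapedate desc limit 5"):
    print row["quadrant"], row["block"], row["wellno"], row["scrapedate"]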