Experiments with regular expressions. A good introduction is available in Google's Python classes. One can also look at the regular-expressions chapter of Dive into Python. Online regular expression testers are also available on the web.
a, X, 9, < -- ordinary characters just match themselves exactly. The meta-characters which do not match themselves because they have special meanings are: . ^ $ * + ? { [ ] | ( ) (details below)
- . (a period) -- matches any single character except newline '\n'
- \w -- (lowercase w) matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_]. Note that although "word" is the mnemonic for this, it only matches a single word char, not a whole word. \W (upper case W) matches any non-word character.
- \b -- boundary between word and non-word
- \s -- (lowercase s) matches a single whitespace character -- space, newline, return, tab, form [ \n\r\t\f]. \S (upper case S) matches any non-whitespace character.
- \t, \n, \r -- tab, newline, return
- \d -- decimal digit [0-9] (some older regex utilities do not support \d, but they all support \w and \s)
- ^ = start, $ = end -- match the start or end of the string
\ -- inhibit the "specialness" of a character. So, for example, use \. to match a period or \\ to match a backslash. If you are unsure if a character has special meaning, such as '@', you can put a backslash in front of it, \@, to make sure it is treated just as a character.
Things get more interesting when you use + and * to specify repetition in the pattern
- "+" -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i's
- "*" -- 0 or more occurrences of the pattern to its left
- "?" -- match 0 or 1 occurrences of the pattern to its left
Testing emails
We first create a file with a list of emails. The goal is to design a simple regexp in order to check the validity of emails.
%%file tstmails.txt
jf.bercher@esiee.fr
jfb@mailinator.com
daniel.courivaud@esiee.fr
zorro.pancho@arizona-u.edu
tEsT@domain.info
test@my-domain.info
test@my-domain.information
de-quoi-je-me-mèle@adresse.org
The first idea is just to check that an email has the structure "name@domain.ext". Then one can refine the test by noting that name and domain are composed of standard (not extended) ascii characters, plus the ".", "_" and "-". (without "." for the domain part). Furthermore, the ext has a length between 1 and 4.
import re

# Naive test: name@domain.ext where every part is made of \w characters only.
# Raw strings avoid the invalid-escape warnings that "\w" triggers in
# recent Python versions.
mail = re.compile(r"\w+@\w+\.\w{1,4}")

# Refined test: allow ".", "_" and "-" in the name part, "-" in the domain
# part, and anchor at the end of the string so the extension really has
# between 1 and 4 characters (re.ASCII restricts \w to plain ASCII).
mail2 = re.compile(r"""
    [\w.-]+     # name: an alphanumeric character [a-zA-Z0-9_] plus . and -
    @           # the @
    [\w-]+      # domain: an alphanumeric character [a-zA-Z0-9_] plus -
    \.          # a literal .
    \w{1,4}     # extension: between one and 4 characters
    $           # end of string
    """, re.ASCII | re.VERBOSE)

# Read the test addresses; the context manager guarantees the file is closed.
with open('tstmails.txt') as f:
    mails = f.readlines()

print("-" * 42)
print("With 1st test")
print("-" * 42)
for tst in mails:
    # rstrip('\n') instead of tst[:-1]: the latter chops a real character
    # when the last line of the file has no trailing newline.
    status = "MATCHED" if mail.match(tst) else "NOT MATCHED"
    print("{0:<32s} --- {1}".format(tst.rstrip('\n'), status))

print("-" * 42)
print("With 2nd test")  # fixed typo "2nt"
print("-" * 42)
for tst in mails:
    status = "MATCHED" if mail2.match(tst) else "NOT MATCHED"
    print("{0:<32s} --- {1}".format(tst.rstrip('\n'), status))
Looking for and extracting components of a date
# IPython shell-capture magic: run the shell command `date` and capture its
# stdout into `a` as a list of lines (an IPython SList).  Only works inside
# IPython / Jupyter, not in plain Python.
a=!date
# Echo the captured value (the notebook displays the last expression of a cell).
a
# Extract the HH:MM:SS timestamp from the date string `a[0]` produced by the
# previous cell.  Raw strings for the patterns: a non-raw "\d" is a
# SyntaxWarning in recent Python versions.
out = re.search(r"\d{2}:\d{2}:\d{2}", a[0])
print("Result of first regexp: ", out.group(0))

# Same pattern with capturing groups, so each field is retrievable by index.
out = re.search(r"(\d{2}):(\d{2}):(\d{2})", a[0])
print("Result of second regexp: ", out.group(0), "\n", "group 1: ", out.group(1), "group 3: ", out.group(3))

# With named groups, fields are retrieved by name instead of position.
out = re.search(r"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})", a[0])
print("Result of third regexp: ", out.group(0), "\n", "hour: ", out.group('hours'), "minutes: ", out.group('minutes'))
Now we test the findall
version for extracting all the timestamps of a text.
# Sample text containing three HH:MM:SS timestamps to be extracted.
txt="Nuon Chea est d'évidence, après la mort de Pol Pot, le 15 avril 1998, celui que la justice pouvait considérer le plus directement 17:21:35 responsable dans l'élaboration des rouages de la machine de mort mise en place au Cambodge après la chute de Phnom Penh aux 17:24:30 mains des Khmers rouges, le 17 avril 1975. « On peut se demander si Nuon Chea, le “frère numéro deux”, n'était pas presque aussi important que le “frère numéro un” [Pol Pot] », avance même l'historien Henri Locard dans son livre Pourquoi les Khmers rouges ? (éditions Vendémiaire, 2013). Selon lui, 18:20:05 le couple Pol-Nuon formait comme une « hydre à deux têtes"

# search() only returns the FIRST occurrence.  Raw strings for the patterns.
out = re.search(r"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})", txt)
print("Result: ", out.group(0), "\n", "hour: ", out.group('hours'), "minutes: ", out.group('minutes'), "secondes: ", out.group('seconds'))

# findall() on a compiled pattern returns every (hours, minutes, seconds)
# tuple present in the text.
hours = re.compile(r"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})")
result = hours.findall(txt)
for k, h in enumerate(result):
    print("hours {0}: ".format(k), h[0], "h ", h[1], "m ", h[2], "s")
Looking for the charset in an html file
The charset can be found in html thanks to the tag meta
- HTML5: <meta charset="UTF-8">
- HTML4: <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- sometimes <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
# Using a regular expression to find the charset declaration in raw HTML.
def look_for_encoding(html):
    """Return the charset advertised in an HTML byte string.

    Tries the HTML4 form (<meta http-equiv=... charset=...>) first, then the
    more general/HTML5 form (<meta ... charset=...>).  Falls back to b'utf8'
    when no charset declaration is found.

    Parameters
    ----------
    html : bytes
        Raw (undecoded) HTML document.

    Returns
    -------
    bytes
        The charset name, e.g. b'ISO-8859-9', or b'utf8' as a default.
    """
    import re
    # rb'' strings: byte patterns for a bytes input, raw to keep \s, \' literal.
    encoding = re.search(rb'<meta\s* http-equiv.*?charset=["\']*(.+?)["\'\/]*>', html, flags=re.I)
    if not encoding:
        encoding = re.search(rb'<meta.*?charset=["\']*(.+?)["\'\/]*>', html, flags=re.I)
    if not encoding:
        # Bug fix: the original assigned the *string* 'utf8' here and then
        # called .group(1) on it, raising AttributeError.  Return the
        # default directly instead (bytes, consistent with the other path).
        return b'utf8'
    return encoding.group(1)
Test: we give a (moderately) difficult html text -- taken from the web -- and launch the look_for_encoding function on it.
%%file html_encoding_tst.html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="tr"> <head>
<base href="http://www.r10.net/"/><!--[if IE]></base><![endif]--> <meta property="og:image" content="http://www.r10.net/ogmeta.jpg"/> <meta itemprop="image" content="http://www.r10.net/ogmeta.jpg"> <script type="text/javascript">
//<![CDATA[
try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dokv=88e434a982/"},atok:"1b6c1619cc6cd0bc2fb552cd582bec70",petok:"ddbe7bcfe5ba056091fa074d5173e3802d6ef986-1407230025-1800",zone:"r10.net",rocket:"m",apps:{"brwbl":{"a":"0","brwbl":"38,38,40,40,37,39,37,39,66,65"},"abetterbrowser":{"ie":"7"}}}];CloudFlare.push({"apps":{"ape":"6781678d84514f25f701f9c595588ba7"}});document.write('<script type="text/javascript" src="//ajax.cloudflare.com/cdn-cgi/nexp/dokv=97fb4d042e/cloudflare.min.js"><'+'\/script>');}}catch(e){};
//]]>
</script>
<link rel="image_src" href="http://www.r10.net/ogmeta.jpg"/> <meta property="og:title" content="charset=iso-8859-9, charset=windows-1254, charset=utf-8" kullanımı hakkında yardım."/> <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-9"/> <meta name="keywords" content="charset,iso,8859,9,charset,windows,1254,charset,utf,8,quot,kullanımı,hakkında,yardım, charset=iso-8859-9, charset=windows-1254, charset=utf-8" kullanımı hakkında yardım., Google Webmaster forum,seo board, webmaster tools, wordpress adsense, google optimizasyon, dedicated server, hosting"/> <meta name="description" content="Arkadaşlar öncelikle merhaba, bir sorum olucak benim haber sitemde <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-9" /> yazılımcı bunu kullanıyor ancak"/> """
# Open the test page in binary mode (the bytes must stay undecoded, since the
# point is precisely to discover the encoding).  The original left the file
# handle open; the context manager closes it.
with open('html_encoding_tst.html', 'rb') as h:
    a = look_for_encoding(h.read())
print(a)                   # a is of class bytes
print(a.decode('ascii'))   # charset names are plain ASCII
Messages parsing
This example is taken from the course Python: regular expressions at UIS - university of Cambridge. The goal is to extract the lines like >Jun 30 03:02:16 noether sshd[9515]: Invalid user gopher from 65.19.189.149
Jul 1 07:41:11 noether sshd[14506]: Invalid user test from 210.51.172.168
of the log /var/log/messages of a workstation subjected to hackers attacks. We want not only to extract these lines, but also to keep the date, username and IP of the attempts, eg as a list of tuples.
import re
import sys

# Verbose pattern matching one "Invalid user" line of /var/log/messages, e.g.
#   Jun 30 03:02:16 noether sshd[9515]: Invalid user gopher from 65.19.189.149
pattern = r'''
^
([A-Z][a-z]{2})\s+                      # Month, e.g. "Jun"
([123]?[0-9])\s+                        # Day: 1 or 2 digits ("Jul 1" must match too)
(\d\d:\d\d:\d\d)\s+                     # Time HH:MM:SS
noether\s+sshd
\[\d+\]:\s+                             # Process ID
Invalid\s+user\s+
(\w+)                                   # Attempted (invalid) user name
\s+from\s+
(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})    # Source IP address
$
'''
# Combine flags with | (bitwise or), the conventional way.
regexp = re.compile(pattern, re.IGNORECASE | re.VERBOSE)

# Collect (date, user, ip) tuples for every attack line in the log.
hacks = []
with open('messages', 'r') as data:
    for line in data:
        match = regexp.search(line)
        if match:
            hacks.append((match.group(1) + ' ' + match.group(2) + ' ' + match.group(3),
                          match.group(4), match.group(5)))
print(hacks[:10])
It is even much easier to just launch a findall
over the whole file, instead of doing a loop and test over each line. It goes like this: >Note that we must use the MULTILINE flag, since the regex pattern includes the ^ and $ anchors.
# Same pattern as above; with re.MULTILINE, ^ and $ anchor at every line of
# the whole file content, so a single findall replaces the per-line loop.
pattern = r'''
^
([A-Z][a-z]{2})\s+                      # Month, e.g. "Jun"
([123]?[0-9])\s+                        # Day: 1 or 2 digits ("Jul 1" must match too)
(\d\d:\d\d:\d\d)\s+                     # Time HH:MM:SS
noether\s+sshd
\[\d+\]:\s+                             # Process ID
Invalid\s+user\s+
(\w+)                                   # Attempted (invalid) user name
\s+from\s+
(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})    # Source IP address
$
'''
newregexp = re.compile(pattern, re.IGNORECASE | re.VERBOSE | re.MULTILINE)
with open('messages', 'r') as data:
    out = newregexp.findall(data.read())
print(out[:10])
Substitution
An interesting application is also to substitute things according to a regex. As an illustration, we look for a simple conversion of bold and italic tags in a markdown text. In markdown, bold is indicated by **bold**
or __bold__
and italic by *italic*
or _italic_
. Therefore, the goal is to look for groups delimited by a pair of *
, **
, etc and then substitute them by another tagged version, e.g. an html tag.
# Convert markdown bold/italic markers to html tags with named groups.
# (The original assigned txt twice; the dead first assignment is removed.)
txt = """This is a simple text in markdown, with **bold**, _italic_;
another __bold__ **and** *another italic*.
The goal is to convert this into an _html_ version."""

import re

# Raw strings avoid invalid-escape warnings, and the non-greedy .*? keeps a
# match from spanning several delimited groups on the same line.
p1 = r'\*\*(?P<bf>.*?)\*\*'   # **bold**
p2 = r'__(?P<bf>.*?)__'       # __bold__
p3 = r'\*(?P<it>.*?)\*'       # *italic*  (must run after p1)
p4 = r'_(?P<it>.*?)_'         # _italic_  (must run after p2)

# Bug fix: the 4th positional argument of re.sub is *count*, not flags, so
# the original `re.sub(p, repl, txt, re.M)` silently limited the number of
# substitutions to re.M == 8.  Pass the flag by keyword.
txt = re.sub(p1, r'<b>\g<bf></b>', txt, flags=re.M)
txt = re.sub(p2, r'<b>\g<bf></b>', txt, flags=re.M)
txt = re.sub(p3, r'<i>\g<it></i>', txt, flags=re.M)
txt = re.sub(p4, r'<i>\g<it></i>', txt, flags=re.M)
print(txt)
Without named groups, it is needed to escape the groups backreferences, otherwise they are interpreted as octal character code escapes.
# Same conversion with plain numbered groups instead of named ones.
txt = """This is a simple text in markdown, with **bold**, _italic_;
another __bold__ **and** *another italic*.
The goal is to convert this into an _html_ version."""

import re

p1 = r'\*\*(.*?)\*\*'   # **bold**
p2 = r'__(.*?)__'       # __bold__
p3 = r'\*(.*?)\*'       # *italic*  (must run after p1)
p4 = r'_(.*?)_'         # _italic_  (must run after p2)

# The backreference \1 must stay escaped in the replacement (raw string),
# otherwise it is interpreted as an octal character code escape.
# Bug fix: re.sub's 4th positional argument is count, so flags are keyword.
txt = re.sub(p1, r'<b>\1</b>', txt, flags=re.M)
txt = re.sub(p2, r'<b>\1</b>', txt, flags=re.M)
txt = re.sub(p3, r'<i>\1</i>', txt, flags=re.M)
txt = re.sub(p4, r'<i>\1</i>', txt, flags=re.M)
print(txt)
This is a more efficient implementation
# More compact version: a backreference \1 enforces that the closing
# delimiter is identical to the opening one (so _x* does not match).
txt = """This is a simple text in markdown, with **bold**, _italic_;
another __bold__ **and** *another italic*.
The goal is to convert this into an _html_ version."""

import re

p1 = r'([_\*])(.*?)\1'      # a single delimiter, twice  -> italic
p2 = r'([_\*]{2})(.*?)\1'   # a doubled delimiter, twice -> bold

# Process doubled (bold) delimiters first so ** is not seen as two single *.
# Bug fixes: doubled delimiters mean bold, so the replacement is <b> (the
# original emitted <i>); and re.sub's 4th positional argument is count, not
# flags, so re.M must be passed by keyword.
txt = re.sub(p2, r'<b>\2</b>', txt, flags=re.M)
txt = re.sub(p1, r'<i>\2</i>', txt, flags=re.M)
print(txt)
GPS location from a google-map page
While it is possible to directly use Google's Geocoding API, or even a dedicated python module like geopy, it is still amusing to extract the GPS location from a google maps webpage. This is easily done using a regex expression on the html response to a location request. This is the objective of the following script.
%%file coordGPS.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on May24, 2014
Last updated on July 19, 2014
@author: bercherj
"""
def coord(city, proxies={}):
    """Return the GPS coordinates of a city, scraped from Google Maps.

    inputs:
    ------
    city: string
        name of the searched city
    proxies: dict, optional
        proxy configuration passed to urllib.request.ProxyHandler
        (default {} disables proxies; the dict is never mutated, so the
        mutable default is harmless here)

    outputs:
    -------
    latlng: tuple
        GPS coordinates (lat, lng) as strings

    Raises IndexError if no coordinates are found in the returned page
    (e.g. if Google changes the page format).
    """
    import urllib.parse
    import urllib.request
    import re

    url = 'http://maps.google.fr/maps'
    values = {'saddr': str(city),
              'f': 'd'}
    proxy_handler = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy_handler)
    # NOTE(review): the page is decoded as latin1 -- presumably the charset
    # served by maps.google.fr; confirm if results look garbled.
    with opener.open(url + '/?' + urllib.parse.urlencode(values)) as response:
        the_page = response.read().decode('latin1')
    # The coordinates appear in the embedded javascript as
    #   latlng:{lat:<number>,lng:<number>},
    latlng = re.findall('latlng:{lat:([-0-9.]*),lng:([-0-9.]*)},', the_page)
    return latlng[0]
if __name__ == '__main__':
    import argparse

    whatitdoes = "This program returns the GPS location given the name of a city. It uses the information given by google maps."
    myself = "(c) JFB"
    parser = argparse.ArgumentParser(description=whatitdoes, epilog=myself)
    # No option string (e.g. '-o'/'--output') ==> positional argument.
    parser.add_argument(
        help='Name of cities to be processed',
        dest='cities',
        default=['grenoble'],
        type=str,
        nargs='*'
    )
    parser.add_argument(
        '-v', '--verbose',
        help='Prints information',
        dest='verbose',
        default=False,
        action='store_true'
    )
    parser.add_argument(
        '-p', '-proxy', '--proxy',
        help='Takes proxy configuration from "proxy.cfg"',
        dest='proxy',
        default=False,
        action='store_true'
    )
    args = parser.parse_args()
    print(args)

    # Optional proxy configuration, read from an ini-style proxy.cfg file
    # expected to contain a [proxies] section.
    proxies = {}
    if args.proxy:
        import configparser
        config = configparser.ConfigParser()
        s = config.read('proxy.cfg')
        if s:  # the cfg file exists
            proxies = dict(config['proxies'])
            print(proxies)

    # Query and display the coordinates of each requested city.
    for city in list(args.cities):
        print(city)
        ll = coord(city, proxies)
        print(city + ": Latitude: {0}, Longitude: {1}".format(ll[0], ll[1]))
# IPython magics: run the script written above by the %%file cell, first to
# display its help message, then with an actual city name.
%run coordGPS.py -h
%run coordGPS.py Gérone
# Notebook footer -- HTML, the_end and theNotebook are presumably provided by
# a companion helper module loaded earlier (not visible in this file).
HTML(the_end(theNotebook))