Taken from:
https://www.machinelearningplus.com/python/python-regex-tutorial-examples
Main functions used in this tutorial:
*re.compile()
*re.split()
*re.findall()
*re.search()
*re.match()
*re.search().start()
*re.search().end()
*re.match().start()
*re.match().end()
*re.group()
### BASIC SYNTAX
# . One character except new line
# \. A period. \ escapes a special character.
# \d One digit
# \D One non-digit
# \w One word character including digits
# \W One non-word character
# \s One whitespace
# \S One non-whitespace
# \b Word boundary
# \n Newline
# \t Tab
### MODIFIERS
# $ End of string
# ^ Start of string
# ab|cd Matches ab or de.
# [ab-d] One character of: a, b, c, d
# [^ab-d] One character except: a, b, c, d
# () Items within parenthesis are retrieved
# (a(bc)) Items within the sub-parenthesis are retrieved
### REPETITIONS
# [ab]{2} Exactly 2 continuous occurrences of a or b
# [ab]{2,5} 2 to 5 continuous occurrences of a or b
# [ab]{2,} 2 or more continuous occurrences of a or b
# + One or more
# * Zero or more
# ? 0 or 1
### PYTHON REGEX FLAGS
# re.I re.IGNORECASE ignore case.
# re.M re.MULTILINE make begin/end {^, $} consider each line.
# re.S re.DOTALL make . match newline too.
# re.U re.UNICODE make {\w, \W, \b, \B} follow Unicode rules.
# re.L re.LOCALE make {\w, \W, \b, \B} follow locale.
# re.X re.VERBOSE allow comment in regex.
import re
### Ex2: Compile a regular expression pattern that can match at least one or more space characters
regex = re.compile('\s+')
### Ex3: Split a string with unequal spacing
text = """101 COM Computers
205 MAT Mathematics
189 ENG English"""
# version 1: using re.split()
re.split("\s+", text)
# version 2: using compiled pattern [better if used multiple times]
regex = re.compile('\s+')
regex.split(text)
# => if a particular pattern will be used multiple times, better to compile the regular expression
### Ex4: a) Find pattern matches using findall
# extract all the course numbers
text = """101 COM Computers
205 MAT Mathematics
189 ENG English"""
regex_nbr = re.compile('\d+')
regex_nbr.findall(text)
# ['101', '205', '189']
### Ex4: b) re.findall() vs. re.search() vs. re.match()
# re.findall() also searches for pattern in a text and returns matched positions as a list
# re.search() searches for pattern in a text and returns a "match" object containing starting and ending positions of the 1st occurence
# re.match() also returns a "match" object but requires the pattern to be present at the beginning of the text itself
regex_nbr.search(text)
# <_sre.SRE_Match object; span=(0, 3), match='101'>
regex_nbr.search(text).start()
# 0
regex_nbr.search(text).end()
# 3
regex_nbr.match(text)
# <_sre.SRE_Match object; span=(0, 3), match='101'>
regex_nbr.match(text).start()
# 0
regex_nbr.match(text).end()
# 3
# capture the matched pattern of the first occurence
text2 = """COM Computers
205 MAT Mathematics 189"""
text2_match = regex_nbr.search(text2)
text2[text2_match.start():text2_match.end()]
# '205
# or using re.group()
text2_match.group()
# '205
regex_nbr.match(text2)
print(regex_nbr.match(text2))
# None
### Ex5: substitution with regex with regex.sub()
# even out all the extra spaces and put all the words in one single line:
text = """101 COM \t Computers
205 MAT \t Mathematics
189 ENG \t English"""
re.sub("\s+", "", text)
# equivalent:
regex = re.compile("\s+")
regex.sub("", text)
# to keep new lines:
# -> can be done using negative lookahead (?!\n)
# => it checks for upcoming newline character and excludes it from the pattern
regex = re.compile("((?!\n)\s+)")
regex.sub("", text)
### Ex6: regex groups
# extract number, code and name as separate items
text = """101 COM Computers
205 MAT Mathematics
189 ENG English"""
# without groups, should be done separately
re.findall('[0-9]+', text) # extract the number
re.findall('[A-Z]{3}', text) # extract the code
re.findall('[A-Za-z]{4,}', text) # extract the name
# using a single pattern with groups
course_pattern = '([0-9]+)\s*([A-Z]{3})\s*([A-Za-z]{4,})'
re.findall(course_pattern, text)
# [('101', 'COM', 'Computers'),
# ('205', 'MAT', 'Mathematics'),
# ('189', 'ENG', 'English')]
### Ex7: greedy matching
# (greedy matching is the default behaviour)
text = "< body>Regex Greedy Matching Example < /body>"
re.findall('<.*>', text)
# ['< body>Regex Greedy Matching Example < /body>']
# lazy matching = take as little as possible:
re.findall('<.*?>', text)
# ['< body>', '< /body>']
# to extract only the first match:
re.search('<.*?>', text).group()
# '< body>'
### REGULAR EXPRESSIONS EXAMPLES
# 1) any character, except a new line
text = 'machinelearningplus.com'
re.findall('.', text) # . Any character except for a new line
#> ['m', 'a', 'c', 'h', 'i', 'n', 'e', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', 'p', 'l', 'u', 's', '.', 'c', 'o', 'm']
re.findall('...', text)
#> ['mac', 'hin', 'ele', 'arn', 'ing', 'plu', 's.c']
# 2) match a period
re.findall("\.", text)
# 3) match anything but a period
re.findall("[^\.]", text)
# 4) match digits
text = '01, Jan 2015'
re.findall("\d", text) # match any digit
# ['0', '1', '2', '0', '1', '5']
re.findall("\d+", text) # match at least one digit
# ['01', '2015']
# 5) match anything but a digit
re.findall("\D", text)
# [',', ' ', 'J', 'a', 'n', ' ']
re.findall("\D+", text)
# [', Jan ']
# 6) match any character, including digits
re.findall("\w+", text)
# ['01', 'Jan', '2015']
# 7) match anything but a character
re.findall("\W+", text)
# 8) match collection of characters
re.findall('[a-zA-Z]+', text) # -> match any characters inside []
# 9) match something up to 'n' times
re.findall('\d{4}', text) # -> matches 4 digits
# ['2015']
re.findall('\d{2,4}', text) # -> matches 2 or 4 digits
# ['01', '2015']
# 10) match 1 or + occurences
re.findall(r'Co+l', 'So Cooool')
# NB: when an "r" or "R" prefix is present, a character following a backslash is included in the string without change
# print("\newline")
# ewline
# print(r"\newline")
# \newline
# 11) match any number of occurences
re.findall("Pi*lani", "Pilani")
# ['Pilani']
# 12) match exactly 0 or 1 occurence
re.findall("Pio?lani", "Pilani")
# ['Pilani']
re.findall("Pi?lani", "Pilani")
# ['Pilani']
# 13) match word boundary
re.findall(r"\btoy", "toy cat")
# ['toy']
re.findall(r"\btoy\b", "toy cat")
# ['toy']
re.findall(r"\btoy", "tolstoy")
# []
re.findall(r"toy\b", "tolstoy")
# ['toy']
### EXERCISES
# Q1. Extract the user id, domain name and suffix from the following email addresses.
emails = """zuck26@facebook.com
page33@google.com
jeff42@amazon.com"""
# solution 1
re.findall(r"(.+)@(.+)\.(.+)", emails)
# solution 2
match_pattern = r'(\w+)@([A-Z0-9]+)\.([A-Z]{2,4})'
re.findall(match_pattern, emails, flags=re.IGNORECASE)
# Q2. Retrieve all the words starting with ‘b’ or ‘B’ from the following text.
text = """Betty bought a bit of butter, But the butter was so bitter, So she bought some better butter, To make the bitter butter better."""
# solution 1
match_pattern = r"\bB.+?\b"
re.findall(match_pattern, text, flags = re.IGNORECASE)
# solution 2
match_pattern = r"\bB\w+"
re.findall(match_pattern, text, flags = re.IGNORECASE)
# Q3. Split the following irregular sentence into words
sentence = """A, very very; irregular_sentence"""
# match_pattern = r"\b\w+?\b"
# " ".join(re.findall(match_pattern, sentence))
# 'A very very irregular_sentence'
# => wrong, this does not discard "_"
" ".join(re.split('[;,\s_]+', sentence))
# 'A very very irregular sentence'
# Q4. Clean up the following tweet so that it contains only the user’s message.
# That is, remove all URLs, hashtags, mentions, punctuations, RTs and CCs.
tweet = '''Good advice! RT @TheNextWeb: What I would do differently if I was learning to code today http://t.co/lbwej0pxOd cc: @garybernhardt #rstats'''
def clean_tweet(tweet):
tweet = re.sub('http\S+\s*', '', tweet) # remove URLs
tweet = re.sub('RT|cc', '', tweet) # remove RT and cc
tweet = re.sub('#\S+', '', tweet) # remove hashtags
tweet = re.sub('@\S+', '', tweet) # remove mentions
tweet = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', tweet) # remove punctuations
tweet = re.sub('\s+', ' ', tweet) # remove extra whitespace
return tweet
clean_tweet(tweet)
# 'Good advice What I would do differently if I was learning to code today '
# Q5. Extract all the text portions between the tags from the following HTML page:
# https://raw.githubusercontent.com/selva86/datasets/master/sample.html
# Code to retrieve the HTML page:
import requests
r = requests.get("https://raw.githubusercontent.com/selva86/datasets/master/sample.html")
r.text # html text is contained here
re.findall('<.*?>(.*)</.*?>', r.text)