正規表現の使い方メモ

PerlやRubyと比べて若干使いにくいPythonの正規表現についてのメモ。
基本的にはmatchではなくsearchを使う(matchは文字列の先頭でしかマッチしないため)。
複数箇所にマッチさせたい場合はfindallを使う。
場合によってはsubをコールバック関数と組み合わせて置換する。

import re
# -*- encoding: utf-8 -*- 

# Sample HTML document that the regex examples in this file search.
# Things to notice in the sample:
#   * the heading is written with an upper-case tag name (<H1>),
#   * attribute values use mixed quoting ('title', "bold", and unquoted hoge),
#   * the first <li> element spans two lines.
# NOTE(review): the last line of the document is "<html>" rather than
# "</html>" -- possibly a typo in the sample; confirm before relying on it.
html="""
<html>
<head>
<title>サンプル</title>
</head>
<body>
<H1 id='title' class="bold">タイトル</H1>
<ul class=hoge>
  <li>AB
  CD</li>
  <li>DEFG</li>
  <li>HIJK</li>
  <li>LMNO</li>
</ul>
</body>
<html>
"""

# Goal: extract the text inside the <h1> element.
# The pattern is a raw string and the capture is non-greedy, so the group
# stops at the first closing tag instead of swallowing several <h1> elements
# that happen to share a line.
title_pattern = r'<h1[^>]*>(.*?)</h1>'

# NOTE: print("") emits the same blank line as a bare Python 2 `print`
# statement and is also valid on Python 3.

# re.match only matches at the very beginning of the string, and the sample
# does not start with <h1>, so this yields None.
m = re.match(title_pattern, html)
print(m)
print("")

# re.search scans the whole string, but the sample spells the tag <H1> and
# the pattern is case-sensitive, so this still yields None.
m = re.search(title_pattern, html)
print(m)
print("")

# With re.IGNORECASE the search succeeds; group(1) returns the text captured
# by the parenthesized part of the pattern.
m = re.compile(title_pattern, re.IGNORECASE).search(html)
print(m)
print(m.group(1))
print("")

################################################################################
# Goal: collect every <li> element.
# The outer group captures the whole element, the inner group only its text,
# so findall returns a list of (element, text) tuples.
list_pattern = r'(<li>(.*?)</li>)'

# Without flags "." does not match a newline, so the first <li>, whose text
# spans two lines, is missing from the result.
m = re.findall(list_pattern, html)
print(m)
print("")

# With re.DOTALL "." also matches newlines, so the two-line item is found too.
# list_pattern is unchanged, so it is not assigned a second time.
m = re.compile(list_pattern, re.DOTALL).findall(html)
print(m)
print("")

# Bonus: substitution driven by a callback function.
def callback(match):
    """Print the text captured by group 2 and return it wrapped in '#'.

    Intended to be passed as the replacement function to a regex
    substitution.

    Args:
        match: A match object that has at least two groups.

    Returns:
        The string "#<group 2>#", which replaces the whole match.
    """
    inner = match.group(2)
    # Single-argument parenthesized print behaves identically on Python 2
    # and Python 3, unlike the Python 2-only print statement.
    print(inner)
    return "#%s#" % inner
# Replace every <li> element with the callback's return value and print the
# resulting document. The pattern is compiled with re.DOTALL so the <li> whose
# text spans two lines is replaced as well; without the flag "." stops at the
# newline and that item would be left untouched.
print(re.compile(list_pattern, re.DOTALL).sub(callback, html))