"""Exposes several SGMLParser subclasses. This work, including the source code, documentation and related data, is placed into the public domain. The orginal author is Robert Brewer. THIS SOFTWARE IS PROVIDED AS-IS, WITHOUT WARRANTY OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE RESULTING FROM THE USE, MODIFICATION, OR REDISTRIBUTION OF THIS SOFTWARE. If you don't need thread-safety, you might create a single instance of the parser you want, and feed it yourself. You also might use the classes directly if you need to customize them in some way; for example, you may need to alter the list of unsafe_tags in the Sanitizer class, either per-instance or by subclassing it. If you need thread-safe parsing, you should use the functions provided. They create a new instance each time, so you get a *small* performance hit, but by the same token, each thread can work on its own instance. """ import re import sgmllib import htmlentitydefs from xml.sax.saxutils import quoteattr interesting = re.compile('[&<]') incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' '<([a-zA-Z][^<>]*|' '/([a-zA-Z][^<>]*)?|' '![^<>]*)?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#([0-9]+)[^0-9]') starttagopen = re.compile('<[>a-zA-Z]') class MoreReasonableSGMLParser(sgmllib.SGMLParser): """Just like an SGML Parser, but with more information passed to the handle_ methods. For example, handle_entityref passes the whole match, ampersand, name, and trailer.""" # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.nomoretags: self.handle_data(rawdata[i:n]) i = n break match = interesting.search(rawdata, i) if match: j = match.start() else: j = n if i < j: self.handle_data(rawdata[i:j]) i = j if i == n: break if rawdata[i] == '<': if starttagopen.match(rawdata, i): if self.literal: self.handle_data(rawdata[i]) i = i+1 continue k = self.parse_starttag(i) if k < 0: break i = k continue if rawdata.startswith(" (i + 1): self.handle_data("<") i = i+1 else: # incomplete break continue if rawdata.startswith("" javascript = r"""(?i)href\w*=['"]javascript:""" unsafe_attributes = [u'abort', u'blur', u'change', u'click', 'dblclick', u'error', u'focus', u'keydown', u'keypress', u'keyup', u'load', u'mousedown', u'mouseout', u'mouseover', u'mouseup', u'reset', u'resize', u'submit', u'unload', ] empty_tags = [u'area', u'base', u'basefont', u'br', u'hr', u'img', u'input', u'link', u'meta', u'param', ] def handle_data(self, data): self.result.append(data) def handle_charref(self, ref): self.result.append('&#' + ref + ';') def handle_entityref(self, ref, trailer): self.result.append('&' + ref + trailer) def handle_decl(self, data): tag = data.split(" ")[0].lower() if ("!" + tag) in self.unsafe_tags: self.result.append(self.replacement) else: self.result.append(u'') def unknown_starttag(self, tag, attributes): if tag in self.unsafe_tags: self.result.append(self.replacement) else: attrs = [] for name, value in attributes: if name not in self.unsafe_attributes: attrs.append(' ' + name + '=' + quoteattr(value)) if tag in self.empty_tags: tail = ' />' else: tail = '>' self.result.append('<' + tag + ''.join(attrs) + tail) def unknown_endtag(self, tag): if tag in self.unsafe_tags: self.result.append(self.replacement) else: if tag not in self.empty_tags: self.result.append('') def sanitize(content): """Strips specific HTML tags from content. Entities are retained.""" s = Sanitizer() s.result = [] s.feed(content) s.close() return u"".join(s.result)