# Copyright (C) 2012 AG Projects. See LICENSE for details
#

"""Small text/URI helpers: HTML <-> plain text conversion and URI formatting."""

__all__ = ['html2text', 'text2html', 'format_uri']


def html2text(data):
    """Return the text content of the HTML document in ``data``.

    The document is parsed with lxml, passed through an lxml ``Cleaner``
    created with ``style=True``, and the remaining text content is returned
    with leading and trailing newlines stripped.

    This is a best-effort conversion: if parsing, cleaning or text
    extraction raises any ``Exception``, an empty string is returned
    instead of propagating the error.
    """
    # Imported here, outside the ``try`` below, so that a missing lxml
    # installation surfaces as an ImportError rather than being turned
    # into an empty result.
    import lxml.html
    import lxml.html.clean

    try:
        doc = lxml.html.document_fromstring(data)
        cleaner = lxml.html.clean.Cleaner(style=True)
        doc = cleaner.clean_html(doc)
        return doc.text_content().strip('\n')
    except Exception:
        # Deliberate best effort: unparsable input yields no text.
        return ''


# Template used by text2html(); ``%(data)s`` is replaced with the payload.
# NOTE(review): only the ``%(data)s`` placeholder is visible in the reviewed
# text. If this template originally wrapped the placeholder in XHTML-IM
# markup and line breaks, restore them here -- confirm against the repository.
xhtml_im_template = """ %(data)s """


def text2html(data):
    """Return ``xhtml_im_template`` with ``data`` substituted for ``%(data)s``.

    ``data`` is inserted verbatim; no escaping is performed here.
    NOTE(review): presumably callers pass text that is already safe to embed
    in markup -- verify, since characters such as ``<`` or ``&`` are not
    escaped by this function.
    """
    return xhtml_im_template % {'data': data}


def format_uri(uri, scheme=''):
    """Return ``uri`` formatted as ``user@host``, optionally scheme-prefixed.

    ``uri`` must provide ``user`` and ``host`` attributes. When ``scheme`` is
    non-empty the result is prefixed with ``scheme:``; otherwise no prefix is
    added.
    """
    prefix = scheme + ':' if scheme else ''
    return '%s%s@%s' % (prefix, uri.user, uri.host)