Diffstat (file mode, lines changed, path):
-rw-r--r--    4  .bzrignore
-rwxr-xr-x  141  admin_cb.py
-rw-r--r--   78  docs/admin.html
-rw-r--r--   13  docs/config.html
-rw-r--r--   34  docs/contributing.html
-rw-r--r--    1  docs/index.html
-rw-r--r--    6  docs/templates.html
-rw-r--r--   79  favicon.py
-rw-r--r--   18  filters/mememe.plugin
-rw-r--r--    2  filters/minhead.py
-rwxr-xr-x   31  planet.py
-rw-r--r--    4  planet/__init__.py
-rw-r--r--   29  planet/config.py
-rwxr-xr-x    3  planet/csv_config.py
-rw-r--r--   26  planet/publish.py
-rw-r--r--   35  planet/reconstitute.py
-rw-r--r--   25  planet/scrub.py
-rw-r--r--    2  planet/shell/__init__.py
-rw-r--r--   16  planet/shell/_genshi.py
-rw-r--r--    4  planet/shell/dj.py
-rw-r--r--    3  planet/shell/tmpl.py
-rw-r--r--    2  planet/shell/xslt.py
-rw-r--r--   20  planet/spider.py
-rw-r--r--   26  planet/splice.py
-rwxr-xr-x   25  planet/vendor/feedparser.py
-rw-r--r--    7  planet/vendor/html5lib/__init__.py
-rw-r--r--   53  planet/vendor/html5lib/constants.py
-rw-r--r--  565  planet/vendor/html5lib/html5parser.py
-rw-r--r--   51  planet/vendor/html5lib/ihatexml.py
-rw-r--r--   97  planet/vendor/html5lib/inputstream.py
-rw-r--r--  148  planet/vendor/html5lib/sanitizer.py
-rw-r--r--   68  planet/vendor/html5lib/serializer/htmlserializer.py
-rw-r--r--  743  planet/vendor/html5lib/tokenizer.py
-rwxr-xr-x   17  planet/vendor/html5lib/treebuilders/__init__.py
-rwxr-xr-x   50  planet/vendor/html5lib/treebuilders/_base.py
-rw-r--r--   46  planet/vendor/html5lib/treebuilders/dom.py
-rwxr-xr-x   11  planet/vendor/html5lib/treebuilders/etree.py
-rw-r--r--   32  planet/vendor/html5lib/treebuilders/etree_lxml.py
-rwxr-xr-x   34  planet/vendor/html5lib/treebuilders/simpletree.py
-rw-r--r--   19  planet/vendor/html5lib/treebuilders/soup.py
-rw-r--r--   18  planet/vendor/html5lib/treewalkers/_base.py
-rw-r--r--    1  planet/vendor/html5lib/treewalkers/dom.py
-rw-r--r--    6  planet/vendor/html5lib/treewalkers/genshistream.py
-rw-r--r--    6  planet/vendor/html5lib/treewalkers/lxmletree.py
-rw-r--r--    9  planet/vendor/html5lib/treewalkers/soup.py
-rw-r--r--   21  planet/vendor/html5lib/utils.py
-rw-r--r--    3  planet/vendor/httplib2/__init__.py
-rw-r--r--   10  planet/vendor/pubsubhubbub_publisher/PKG-INFO
-rw-r--r--    2  planet/vendor/pubsubhubbub_publisher/__init__.py
-rw-r--r--   77  planet/vendor/pubsubhubbub_publisher/pubsubhubbub_publish.py
-rwxr-xr-x   17  publish.py
-rw-r--r--    2  tests/data/config/basic.csv
-rw-r--r--    1  tests/data/config/basic.ini
-rw-r--r--    2  tests/data/filter/django/test.xml
-rw-r--r--    2  tests/data/reconstitute/content_illegal_char.xml
-rw-r--r--   12  tests/data/reconstitute/dc_date_taken.xml
-rw-r--r--   10  tests/data/reconstitute/georss_box_latlong.xml
-rw-r--r--   15  tests/data/reconstitute/georss_placeboxpolygon_latlong.xml
-rw-r--r--   11  tests/data/reconstitute/georss_point_latlong.xml
-rw-r--r--   10  tests/data/reconstitute/georss_polygon_latlong.xml
-rw-r--r--   11  tests/data/reconstitute/gr_id.xml
-rw-r--r--    2  tests/data/reconstitute/rss_source.xml
-rw-r--r--    1  tests/data/spider/config.ini
-rw-r--r--    3  tests/reconstitute.py
-rw-r--r--    3  tests/test_config.py
-rw-r--r--   15  tests/test_docs.py
-rw-r--r--   12  tests/test_filter_django.py
-rw-r--r--    3  tests/test_reconstitute.py
-rw-r--r--   21  tests/test_spider.py
-rw-r--r--    5  themes/asf/default.css
-rw-r--r--   13  themes/asf/index.html.xslt
-rw-r--r--    8  themes/asf/personalize.js
-rw-r--r--    6  themes/classic_fancy/index.html.tmpl
-rw-r--r--   41  themes/common/admin.html.tmpl
-rw-r--r--    7  themes/common/rss10.xml.tmpl
-rw-r--r--    8  themes/common/rss20.xml.tmpl
-rw-r--r--    6  themes/django/index.html.dj
-rw-r--r--    6  themes/genshi_fancy/index.html.genshi
78 files changed, 2197 insertions, 777 deletions
diff --git a/.bzrignore b/.bzrignore
deleted file mode 100644
index a8f0629..0000000
--- a/.bzrignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*.tmplc
-.DS_Store
-cache
-*.pluginc
diff --git a/admin_cb.py b/admin_cb.py
new file mode 100755
index 0000000..63315e1
--- /dev/null
+++ b/admin_cb.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import cgi
+import cgitb
+cgitb.enable()
+
+from urllib import unquote
+import sys, os
+
+# Modify this to point to where you usually run planet.
+BASE_DIR = '..'
+
+# Modify this to point to your venus installation dir, relative to planet dir above.
+VENUS_INSTALL = "venus"
+
+# Config file, relative to planet dir above
+CONFIG_FILE = "config/live"
+
+# Admin page URL, relative to this script's URL
+ADMIN_URL = "admin.html"
+
+
+# chdir to planet dir - config may be relative from there
+os.chdir(os.path.abspath(BASE_DIR))
+
+# Add venus to path.
+sys.path.append(VENUS_INSTALL)
+
+# Add shell dir to path - auto detection does not work
+sys.path.append(os.path.join(VENUS_INSTALL, "planet", "shell"))
+
+# import necessary planet items
+from planet import config
+from planet.spider import filename
+
+
+# Load config
+config.load(CONFIG_FILE)
+
+# parse query parameters
+form = cgi.FieldStorage()
+
+
+# Start HTML output at once
+print "Content-Type: text/html;charset=utf-8" # HTML is following
+print # blank line, end of headers
+
+
+print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
+print '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sv"><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>Admin results</title></head><body>'
+print '<div>'
+
+# Cache and blacklist dirs
+
+cache = config.cache_directory()
+blacklist = config.cache_blacklist_directory()
+
+# Must have command parameter
+if not "command" in form:
+    print "<p>Unknown command</p>"
+
+elif form['command'].value == "blacklist":
+
+
+    # Create the blacklist dir if it does not exist
+    if not os.path.exists(blacklist):
+        os.mkdir(blacklist)
+        print "<p>Created directory %s</p>" % blacklist
+
+    # find list of urls, in the form bl[n]=url
+
+    for key in form.keys():
+
+        if not key.startswith("bl"): continue
+
+        url = unquote(form[key].value)
+
+        # find corresponding files
+        cache_file = filename(cache, url)
+        blacklist_file = filename(blacklist, url)
+
+        # move to blacklist if found
+        if os.path.exists(cache_file):
+
+            os.rename(cache_file, blacklist_file)
+
+            print "<p>Blacklisted <a href='%s'>%s</a></p>" % (url, url)
+
+        else:
+
+            print "<p>Unknown file: %s</p>" % cache_file
+
+    print """
+<p>Note that blacklisting does not automatically
+refresh the planet. You will need to either wait for
+a scheduled planet run, or refresh manually from the admin interface.</p>
+"""
+
+
+elif form['command'].value == "run":
+
+    # run spider and refresh
+
+    from planet import spider, splice
+    try:
+        spider.spiderPlanet(only_if_new=False)
+        print "<p>Successfully ran spider</p>"
+    except Exception, e:
+        print e
+
+    doc = splice.splice()
+    splice.apply(doc.toxml('utf-8'))
+
+elif form['command'].value == "refresh":
+
+    # only refresh
+
+    from planet import splice
+
+    doc = splice.splice()
+    splice.apply(doc.toxml('utf-8'))
+
+    print "<p>Successfully refreshed</p>"
+
+elif form['command'].value == "expunge":
+
+    # only expunge
+    from planet import expunge
+    expunge.expungeCache()
+
+    print "<p>Successfully expunged</p>"
+
+
+
+
+print "<p><strong><a href='" + ADMIN_URL + "'>Return</a> to admin interface</strong></p>"
+
+
+
+print "</body></html>"
diff --git a/docs/admin.html b/docs/admin.html
new file mode 100644
index 0000000..811bd60
--- /dev/null
+++ b/docs/admin.html
@@ -0,0 +1,78 @@
+<!DOCTYPE html PUBLIC
+ "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
+ "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<script type="text/javascript" src="docs.js"></script>
+<link rel="stylesheet" type="text/css" href="docs.css"/>
+<title>Administration interface</title>
+</head>
+<body>
+<h2>Administration interface</h2>
+<p>Venus comes with a basic administration interface, allowing you to manually run planet, do a refresh from cache, expunge the cache or blacklist individual entries from the planet.</p>
+
+<h3>Using the administration interface</h3>
+
+<p>The administration interface allows you to manage the everyday tasks related to your venus installation.</p>
+
+<ul><li><strong>Running planet</strong>. By clicking the "Run planet" button, you can do a full run of the planet script, rechecking all the feeds and recreating the generated files. This corresponds to running <code>python planet.py config.ini</code> with no extra options. Note that, depending on the number of feeds, this operation may take some time.</li>
+<li><strong>Refreshing planet</strong>. By clicking the "Refresh planet" button, you can do an "offline" run of the planet script, without rechecking all the feeds but still recreating the generated files. This corresponds to running <code>python planet.py -o config.ini</code>.</li>
+<li><strong>Expunging the planet cache</strong>. By clicking the "Expunge cache" button, you can clean the cache from outdated entries. This corresponds to running <code>python planet.py -x config.ini</code>.</li>
+<li><strong>Blacklisting</strong>. By selecting one or more of the entries in the list of entries, and clicking the "Blacklist" button, you can stop these items from displaying on the planet. This is very useful for quickly blocking inappropriate or malformed content from your planet. <i>Note that blacklisting does not take effect until you refresh or rerun the planet</i>. (Blacklisting can also be done manually on the server by moving files from the cache directory to the blacklist directory.)</li>
+</ul>
+
+<p>Installing the administration interface securely requires some knowledge of web server configuration.</p>
+
+<p>The admin interface consists of two parts: the admin template file and the server callback script. Both must be correctly installed for the administration interface to work.</p>
+
+<h3>Installing the admin template</h3>
+
<p>The admin page template is found in <code>themes/common/admin.html.tmpl</code>. This template needs to be added to your config file along with your other templates, and optionally customized. Make sure that <code>action="admin_cb.py"</code> found in several places in the file points to the URL (or relative URL) of the admin callback script below.</p>
+
+<h3>Installing the admin callback script</h3>
+
+<p>The admin callback script, admin_cb.py, needs to be copied to somewhere among your web server files. Depending on the details of your web server, your permissions, etc., this can be done in several different ways and in different places. There are three steps involved:</p>
+<ol><li>Configuring the script</li>
+<li>Enabling CGI</li>
+<li>Securing access</li></ol>
+
+
+<h4>Configuring the script</h4>
+
+<p>At the top of the script, there are four variables you must customize. The correct values of the first three variables can be found by analyzing how you normally run the <code>planet.py</code> script. If you typically run planet from within the working directory <code>BASE_DIR</code>, using a command like <blockquote><code>python [VENUS_INSTALL]/planet.py [CONFIG_FILE]</code></blockquote> then you already know all three values; example values are sketched after the list below.</p>
+
+<dl><dt><code>BASE_DIR</code></dt><dd>
+This variable must contain the directory from where you usually run the planet.py script, to ensure that relative file names in the config files work correctly.</dd>
+<dt><code>VENUS_INSTALL</code></dt><dd>
+This variable must contain your venus installation directory, relative to BASE_DIR above.</dd>
+<dt><code>CONFIG_FILE</code></dt><dd>
+This variable must contain your configuration file, relative to BASE_DIR above.</dd>
+<dt><code>ADMIN_URL</code></dt><dd>
+This variable must contain the URL (or relative URL) of the administration page, relative to this script's URL.</dd>
+</dl>
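
For instance, if you normally run <code>python venus/planet.py config/live</code> from <code>/home/planet</code>, the top of <code>admin_cb.py</code> might look like the following (paths shown are examples only, not defaults you must use):

    BASE_DIR      = '/home/planet'   # directory you normally run planet.py from
    VENUS_INSTALL = 'venus'          # venus checkout, relative to BASE_DIR
    CONFIG_FILE   = 'config/live'    # config file, relative to BASE_DIR
    ADMIN_URL     = 'admin.html'     # admin page, relative to this script's URL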
+
+<h4>Enabling CGI</h4>
+
+<p>You will need to ensure that the script can be run as a CGI script. This is done differently on different web server platforms, but there are at least three common patterns:</p>
+
+<ul><li><b>Apache with <code>.htaccess</code></b>. If your server allows you to use <code>.htaccess</code> files, you can simply add
+<blockquote><code>Options +ExecCGI<br />
+AddHandler cgi-script .py</code></blockquote>
+in an .htaccess file in the planet output directory to enable the server to run the script. In this case, the admin_cb.py file can be put alongside the rest of the planet output files.
+</li>
+<li><b>Apache without <code>.htaccess</code></b>. If your server does not allow you to add CGI handlers to <code>.htaccess</code> files, you can add
+<blockquote><code>Options +ExecCGI<br />
+AddHandler cgi-script .py</code></blockquote>
+to the relevant part of the central apache configuration files.
+</li>
+<li><b>Apache with cgi-bin</b>. If your server only allows CGI handlers in pre-defined directories, you can place the <code>admin_cb.py</code> file there, and make sure to update the <code>action="admin_cb.py"</code> code in the template file <code>admin.html.tmpl</code>, as well as the <code>ADMIN_URL</code> in the callback script.
+</li>
+</ul>
+
+<p>In all cases, it is necessary to make sure that the script is executed as the same user that owns the planet output files and the cache. Either the planet output is owned by the apache user (usually <code>www-data</code>), or Apache's <a href="http://httpd.apache.org/docs/2.0/suexec.html">suexec</a> feature can be used to run the script as the right user.</p>
+
+<h4>Securing the admin interface</h4>
+<p>If you don't want every user to be able to administer your planet, you must secure at least the <code>admin_cb.py</code> file, and preferably the <code>admin.html</code> file as well. This can be done using your web server's regular access control features. See <a href="http://httpd.apache.org/docs/2.0/howto/auth.html">here</a> for Apache documentation.</p>
+
+</body>
+</html>
diff --git a/docs/config.html b/docs/config.html
index 0ed6e59..ee6cf45 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -118,6 +118,19 @@ cache. If specified as a relative path, it is evaluated relative to the
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overriden on a per subscription feed basis.</dd>
+<dt><ins>pubsubhubbub_hub</ins></dt>
+<dd>URL to a PubSubHubbub hub, for example <a
+href="http://pubsubhubbub.appspot.com">http://pubsubhubbub.appspot.com</a>.
+Used by <code>publish</code> to ping the
+hub when feeds are published, speeding delivery of updates to
+subscribers. See
+the <a href="http://code.google.com/p/pubsubhubbub/"> PubSubHubbub
+home page</a> for more information.</dd>
+<dt><ins>pubsubhubbub_feeds</ins></dt>
+<dd>List of feeds to publish. Defaults to <code>atom.xml rss10.xml
+rss20.xml</code>.</dd>
+<dt id="django_autoescape"><ins>django_autoescape</ins></dt>
+<dd>Control <a href="http://docs.djangoproject.com/en/dev/ref/templates/builtins/#autoescape">autoescaping</a> behavior of django templates. Defaults to <code>on</code>.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
diff --git a/docs/contributing.html b/docs/contributing.html
index 2cf95e1..42e8835 100644
--- a/docs/contributing.html
+++ b/docs/contributing.html
@@ -1,6 +1,4 @@
-<!DOCTYPE html PUBLIC
- "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
- "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">
+<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<script type="text/javascript" src="docs.js"></script>
@@ -22,46 +20,46 @@ contribution.</p>
<p>Documentation can be found in the <code>docs</code> directory. It is
straight XHTML.</p>
<p>Test cases can be found in the
-<a href="http://localhost/~rubys/venus/tests/">tests</a> directory, and
+<a href="http://intertwingly.net/code/venus/tests/">tests</a> directory, and
make use of the
<a href="http://docs.python.org/lib/module-unittest.html">Python Unit testing framework</a>. To run them, simply enter:</p>
<blockquote><pre>python runtests.py</pre></blockquote>
-<h3>Bzr</h3>
-<p>If you have done a <a href="index.html">bzr get</a>, you have already set up
+<h3>Git</h3>
+<p>If you have done a <a href="index.html">git pull</a>, you have already set up
a repository. The only additional step you might need to do is to introduce
-yourself to <a href="http://bazaar-vcs.org/">bzr</a>. Type in the following,
+yourself to <a href="http://git-scm.com/">git</a>. Type in the following,
after replacing the <b>bold text</b> with your information:</p>
-<blockquote><pre>bzr whoami '<b>Your Name</b> &lt;<b>youremail</b>@<b>example.com</b>&gt;'</pre></blockquote>
+<blockquote><pre>git config --global user.name '<b>Your Name</b>'
+git config --global user.email '<b>youremail</b>@<b>example.com</b>'</pre></blockquote>
<p>Then, simply make the changes you like. When you are done, type:</p>
-<blockquote><pre>bzr st</pre></blockquote>
+<blockquote><pre>git status</pre></blockquote>
<p>This will tell you which files you have modified, and which ones you may
have added. If you add files and you want them to be included, simply do a:</p>
-<blockquote><pre>bzr add file1 file2...</pre></blockquote>
+<blockquote><pre>git add file1 file2...</pre></blockquote>
-<p>You can also do a <code>bzr diff</code> to see if there are any changes
+<p>You can also do a <code>git diff</code> to see if there are any changes
which you made that you don't want included. I can't tell you how many
debug print statements I have caught this way.</p>
<p>Next, type:</p>
-<blockquote><pre>bzr commit</pre></blockquote>
+<blockquote><pre>git commit -a</pre></blockquote>
<p>This will allow you to enter a comment describing your change. If your
repository is already on your web server, simple let others know where they
-can find it. If not, you can simply ftp or scp the files to your web server
-&mdash; no additional software needs to be installed on that machine.</p>
+can find it. If not, consider using <a href="http://github.com/">github</a> to host your
+<a href="http://help.github.com/forking/">fork</a> of Venus.</p>
<h3>Telling others</h3>
<p>Once you have a change worth sharing, post a message on the
-<a href="http://lists.planetplanet.org/mailman/listinfo/devel">mailing list</a>.</p>
-<p>Also, consider setting up a <a href="http://bzr.mfd-consult.dk/bzr-feed/">bzr-feed</a> for your repository, so people who wish to do so can automatically
-be notified of every change.</p>
-<p>There now is even an nascent <a href="http://planet.intertwingly.net/venus/">planet</a> being formed which combines these feeds of changes. You can <a href="http://planet.intertwingly.net/venus/atom.xml">subscribe</a> to it too.</p>
+<a href="http://lists.planetplanet.org/mailman/listinfo/devel">mailing
+list</a>, or use github to send a <a
+href="http://github.com/guides/pull-requests">pull request</a>.</p>
</body>
</html>
diff --git a/docs/index.html b/docs/index.html
index c461d7f..051eb64 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -22,6 +22,7 @@
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
<li><a href="filters.html">Filters and Plugins</a></li>
+<li><a href="admin.html">Administration interface</a></li>
</ul>
</li>
<li>Other
diff --git a/docs/templates.html b/docs/templates.html
index b9fd9c1..5549901 100644
--- a/docs/templates.html
+++ b/docs/templates.html
@@ -143,6 +143,12 @@ Item.</p>
requires at least Python 2.3.
</p>
+<p>
+ The <a href="config.html#django_autoescape">django_autoescape</a> config
+ option may be used to globally set the default value for
+ <a href="http://docs.djangoproject.com/en/dev/ref/templates/builtins/#autoescape">auto-escaping</a>.
+</p>
+
<h3>xslt</h3>
<p><a href="http://www.w3.org/TR/xslt">XSLT</a> is a paradox: it actually
makes some simple things easier to do than htmltmpl, and certainly can
diff --git a/favicon.py b/favicon.py
new file mode 100644
index 0000000..2e351a3
--- /dev/null
+++ b/favicon.py
@@ -0,0 +1,79 @@
+import sys, socket
+from planet import config, feedparser
+from planet.spider import filename
+from urllib2 import urlopen
+from urlparse import urljoin
+from html5lib import html5parser, treebuilders
+from ConfigParser import ConfigParser
+
+# load config files (default: config.ini)
+for arg in sys.argv[1:]:
+    config.load(arg)
+if len(sys.argv) == 1:
+    config.load('config.ini')
+
+from Queue import Queue
+from threading import Thread
+
+# determine which subscriptions have no icon but do have a html page
+fetch_queue = Queue()
+html = ['text/html', 'application/xhtml+xml']
+sources = config.cache_sources_directory()
+for sub in config.subscriptions():
+    data=feedparser.parse(filename(sources,sub))
+    if data.feed.get('icon'): continue
+    if not data.feed.get('links'): continue
+    for link in data.feed.links:
+        if link.rel=='alternate' and link.type in html:
+            fetch_queue.put((sub, link.href))
+            break
+
+# find the favicon for a given webpage
+def favicon(page):
+    parser=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+    doc=parser.parse(urlopen(page))
+    favicon = urljoin(page, '/favicon.ico')
+    for link in doc.getElementsByTagName('link'):
+        if link.hasAttribute('rel') and link.hasAttribute('href'):
+            if 'icon' in link.attributes['rel'].value.lower().split(' '):
+                favicon = urljoin(page, link.attributes['href'].value)
+    if urlopen(favicon).info()['content-length'] != '0':
+        return favicon
+
+# thread worker that fills in the dictionary which maps subs to favicon
+icons = {}
+def fetch(thread_index, fetch_queue, icons):
+    while 1:
+        sub, html = fetch_queue.get()
+        if not html: break
+        try:
+            icon = favicon(html)
+            if icon: icons[sub] = icon
+        except:
+            pass
+
+# set timeout
+try:
+    socket.setdefaulttimeout(float(config.feed_timeout()))
+except:
+    pass
+
+# (optionally) spawn threads, fetch pages
+threads = {}
+if int(config.spider_threads()):
+    for i in range(int(config.spider_threads())):
+        threads[i] = Thread(target=fetch, args=(i, fetch_queue, icons))
+        fetch_queue.put((None, None))
+        threads[i].start()
+    for i in range(int(config.spider_threads())):
+        threads[i].join()
+else:
+    fetch_queue.put((None, None))
+    fetch(0, fetch_queue, icons)
+
+# produce config file
+config = ConfigParser()
+for sub, icon in icons.items():
+    config.add_section(sub)
+    config.set(sub, 'favicon', icon)
+config.write(sys.stdout)
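
A sketch of consuming this script's output (the captured file name is hypothetical): the ConfigParser written to stdout has one section per subscription with a single favicon option, so it can be read back or merged into a planet configuration.

    from ConfigParser import ConfigParser

    icons = ConfigParser()
    icons.read('favicons.ini')        # e.g. python favicon.py config.ini > favicons.ini
    for sub in icons.sections():
        print sub, icons.get(sub, 'favicon')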
diff --git a/filters/mememe.plugin b/filters/mememe.plugin
index 273c02c..8073a27 100644
--- a/filters/mememe.plugin
+++ b/filters/mememe.plugin
@@ -18,10 +18,15 @@
# sidebar = @class='sidebar'
#
-import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
+import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
+try:
+ from hashlib import md5
+except:
+ from md5 import new as md5
+
import planet
from planet import config
from planet.spider import filename
@@ -209,6 +214,7 @@ class html(sgmllib.SGMLParser):
self.feedurl = ""
self.intitle = False
+ url = url.split('#')[0]
headers = check_cache(url)
try:
@@ -380,6 +386,7 @@ from urllib import quote_plus
for i in range(0,len(weighted_links)):
weight, link, updated = weighted_links[i]
if link in spam: continue
+ if unique_votes(all_links[link]) < 2: continue
# ensure that somebody new points to this entry. This guards against
# groups of related links which several posts point to all.
@@ -405,6 +412,7 @@ for i in range(0,len(weighted_links)):
# otherwise, parse the html
if not title:
title = html(revmap.get(link,link)).title
+ if not title: title = link.strip('/').split('/')[-1]
# dehtmlize
title = re.sub('&(\w+);',
@@ -413,7 +421,7 @@ for i in range(0,len(weighted_links)):
title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
# title too long? Insert zero width spaces where appropriate
- if max(map(len,title.split())) > 30:
+ if len(title.strip())>0 and max(map(len,title.split())) > 30:
title=re.sub('(\W+)',u'\\1\u200b',title)
# save the entry title (it is used later)
@@ -467,7 +475,7 @@ for i in range(0,len(weighted_links)):
tagbase = config.link().split('/')
if not tagbase[-1]: tagbase = tagbase[:-1]
tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
- entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
+ entry.newTextChild(None, 'id', tagbase % md5(link).hexdigest())
entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
meme_link = entry.newTextChild(None, 'link', None)
meme_link.setProp('href', link)
@@ -488,6 +496,10 @@ for i in range(0,len(weighted_links)):
count = count + 1
if count >= 10: break
+# remove ul when there are no memes
+if memes_ul.lsCountNode() < 1:
+ memes_ul.unlinkNode()
+
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
diff --git a/filters/minhead.py b/filters/minhead.py
index b9c225e..056481b 100644
--- a/filters/minhead.py
+++ b/filters/minhead.py
@@ -28,7 +28,7 @@ if first < minhead:
for i in range(6,0,-1):
for oldhead in doc.getElementsByTagName('h%d' % i):
newhead = doc.createElementNS(XHTML_NAMESPACE, 'h%d' % (i+minhead-first))
- for child in oldhead.childNodes:
+ for child in oldhead.childNodes[:]:
newhead.appendChild(child)
oldhead.parentNode.replaceChild(newhead, oldhead)
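
The <code>[:]</code> copy matters because <code>childNodes</code> is live: <code>appendChild()</code> reparents each node and shrinks the list being iterated, so every other child gets skipped. A minimal illustration of the behaviour:

    from xml.dom import minidom

    doc = minidom.parseString('<h1>a<b>b</b>c</h1>')
    old, new = doc.documentElement, doc.createElement('h3')
    for child in old.childNodes:        # live list shrinks as children move
        new.appendChild(child)
    print len(new.childNodes)           # 2 -- one child was skipped

    doc = minidom.parseString('<h1>a<b>b</b>c</h1>')
    old, new = doc.documentElement, doc.createElement('h3')
    for child in old.childNodes[:]:     # iterate over a snapshot instead
        new.appendChild(child)
    print len(new.childNodes)           # 3 -- all children moved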
diff --git a/planet.py b/planet.py
index c278c06..26191bb 100755
--- a/planet.py
+++ b/planet.py
@@ -17,11 +17,13 @@ __license__ = "Python"
import os, sys
if __name__ == "__main__":
- config_file = "config.ini"
+ config_file = []
offline = 0
verbose = 0
only_if_new = 0
expunge = 0
+ debug_splice = 0
+ no_publish = 0
for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help":
@@ -33,6 +35,7 @@ if __name__ == "__main__":
print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
+ print " --no-publish Do not publish feeds using PubSubHubbub"
print
sys.exit(0)
elif arg == "-v" or arg == "--verbose":
@@ -43,14 +46,18 @@ if __name__ == "__main__":
only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
+ elif arg == "-d" or arg == "--debug-splice":
+ debug_splice = 1
+ elif arg == "--no-publish":
+ no_publish = 1
elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg
sys.exit(1)
else:
- config_file = arg
+ config_file.append(arg)
from planet import config
- config.load(config_file)
+ config.load(config_file or 'config.ini')
if verbose:
import planet
@@ -65,8 +72,26 @@ if __name__ == "__main__":
from planet import splice
doc = splice.splice()
+
+    if debug_splice:
+        from planet import logger
+        logger.info('writing debug.atom')
+        debug=open('debug.atom','w')
+        try:
+            from lxml import etree
+            from StringIO import StringIO
+            tree = etree.parse(StringIO(doc.toxml()))
+            debug.write(etree.tostring(tree, pretty_print=True))
+        except:
+            debug.write(doc.toprettyxml(indent=' ', encoding='utf-8'))
+        debug.close()
+
splice.apply(doc.toxml('utf-8'))
+    if config.pubsubhubbub_hub() and not no_publish:
+        from planet import publish
+        publish.publish(config)
+
if expunge:
from planet import expunge
expunge.expungeCache
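
With these changes planet.py accepts more than one configuration file (later files override earlier ones) and a flag to skip hub notification. A hypothetical invocation, where the second file name is only an example of a local override:

    python planet.py --no-publish config.ini local-overrides.ini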
diff --git a/planet/__init__.py b/planet/__init__.py
index f90dfe9..61c2cb1 100644
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -36,5 +36,7 @@ sys.path.insert(1, os.path.join(os.path.dirname(__file__),'vendor'))
# Configure feed parser
import feedparser
-feedparser.SANITIZE_HTML=0
+feedparser.SANITIZE_HTML=1
feedparser.RESOLVE_RELATIVE_URIS=0
+
+import publish
diff --git a/planet/config.py b/planet/config.py
index ba9821c..5295e62 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -105,6 +105,8 @@ def __init__():
define_planet('output_theme', '')
define_planet('output_dir', 'output')
define_planet('spider_threads', 0)
+ define_planet('pubsubhubbub_hub', '')
+ define_planet_list('pubsubhubbub_feeds', 'atom.xml rss10.xml rss20.xml')
define_planet_int('new_feed_items', 0)
define_planet_int('feed_timeout', 20)
@@ -116,6 +118,7 @@ def __init__():
define_planet_list('bill_of_materials')
define_planet_list('template_directories', '.')
define_planet_list('filter_directories')
+ define_planet('django_autoescape', 'on')
# template options
define_tmpl_int('days_per_page', 0)
@@ -134,11 +137,11 @@ def __init__():
define_tmpl('filter', None)
define_tmpl('exclude', None)
-def load(config_file):
+def load(config_files):
""" initialize and load a configuration"""
global parser
parser = ConfigParser()
- parser.read(config_file)
+ parser.read(config_files)
import config, planet
from planet import opml, foaf, csv_config
@@ -157,8 +160,11 @@ def load(config_file):
dirs = config.template_directories()
if theme_dir not in dirs:
dirs.append(theme_dir)
- if os.path.dirname(config_file) not in dirs:
- dirs.append(os.path.dirname(config_file))
+ if not hasattr(config_files, 'append'):
+ config_files = [config_files]
+ for config_file in config_files:
+ if os.path.dirname(config_file) not in dirs:
+ dirs.append(os.path.dirname(config_file))
# read in the theme
parser = ConfigParser()
@@ -172,7 +178,7 @@ def load(config_file):
# merge configurations, allowing current one to override theme
template_files = config.template_files()
parser.set('Planet','template_files','')
- parser.read(config_file)
+ parser.read(config_files)
for file in config.bill_of_materials():
if not file in bom: bom.append(file)
parser.set('Planet', 'bill_of_materials', ' '.join(bom))
@@ -306,7 +312,7 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
def http_cache_directory():
if parser.has_option('Planet', 'http_cache_directory'):
- os.path.join(cache_directory(),
+ return os.path.join(cache_directory(),
parser.get('Planet', 'http_cache_directory'))
else:
return os.path.join(cache_directory(), "cache")
@@ -318,9 +324,16 @@ def cache_sources_directory():
else:
return os.path.join(cache_directory(), 'sources')
+def cache_blacklist_directory():
+ if parser.has_option('Planet', 'cache_blacklist_directory'):
+ return os.path.join(cache_directory(),
+ parser.get('Planet', 'cache_blacklist_directory'))
+ else:
+ return os.path.join(cache_directory(), 'blacklist')
+
def cache_lists_directory():
if parser.has_option('Planet', 'cache_lists_directory'):
- parser.get('Planet', 'cache_lists_directory')
+ return parser.get('Planet', 'cache_lists_directory')
else:
return os.path.join(cache_directory(), 'lists')
@@ -335,7 +348,7 @@ def feed():
def feedtype():
if parser.has_option('Planet', 'feedtype'):
- parser.get('Planet', 'feedtype')
+ return parser.get('Planet', 'feedtype')
elif feed() and feed().find('atom')>=0:
return 'atom'
elif feed() and feed().find('rss')>=0:
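
A minimal sketch of the multi-file loading above (file names are illustrative); ConfigParser reads the files in order, so settings in later files override earlier ones:

    from planet import config

    # equivalent to: python planet.py base.ini local.ini
    config.load(['base.ini', 'local.ini'])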
diff --git a/planet/csv_config.py b/planet/csv_config.py
index ba3be61..9f905a6 100755
--- a/planet/csv_config.py
+++ b/planet/csv_config.py
@@ -13,7 +13,8 @@ def csv2config(input, config=None):
reader = csv.DictReader(input)
for row in reader:
section = row[reader.fieldnames[0]]
- config.add_section(section)
+ if not config.has_section(section):
+ config.add_section(section)
for name, value in row.items():
if value and name != reader.fieldnames[0]:
config.set(section, name, value)
diff --git a/planet/publish.py b/planet/publish.py
new file mode 100644
index 0000000..36df866
--- /dev/null
+++ b/planet/publish.py
@@ -0,0 +1,26 @@
+import os, sys
+import urlparse
+import planet
+import pubsubhubbub_publisher as PuSH
+
+def publish(config):
+    log = planet.logger
+    hub = config.pubsubhubbub_hub()
+    link = config.link()
+
+    # identify feeds
+    feeds = []
+    if hub and link:
+        for root, dirs, files in os.walk(config.output_dir()):
+            for file in files:
+                if file in config.pubsubhubbub_feeds():
+                    feeds.append(urlparse.urljoin(link, file))
+
+    # publish feeds
+    if feeds:
+        try:
+            PuSH.publish(hub, feeds)
+            for feed in feeds:
+                log.info("Published %s to %s\n" % (feed, hub))
+        except PuSH.PublishError, e:
+            log.error("PubSubHubbub publishing error: %s\n" % e)
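
This is the same call that planet.py now makes after splicing; a standalone sketch (config file name is illustrative), which quietly does nothing unless both pubsubhubbub_hub and link are configured and published feed files exist under output_dir:

    from planet import config, publish

    config.load('config.ini')
    publish.publish(config)    # pings the hub for the configured pubsubhubbub_feeds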
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index f5e910d..e2a69eb 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -25,7 +25,7 @@ try:
except:
from md5 import new as md5
-illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
+illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
@@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
+ value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
@@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c):
""" replace invalid characters """
- return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
+ return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value):
@@ -69,6 +70,7 @@ def id(xentry, entry):
if entry.has_key("id") and entry.id:
entry_id = entry.id
+ if hasattr(entry_id, 'values'): entry_id = entry_id.values()[0]
elif entry.has_key("link") and entry.link:
entry_id = entry.link
elif entry.has_key("title") and entry.title:
@@ -102,6 +104,8 @@ def links(xentry, entry):
xlink.setAttribute('type', link.get('type'))
if link.has_key('rel'):
xlink.setAttribute('rel', link.get('rel',None))
+ if link.has_key('title'):
+ xlink.setAttribute('title', link.get('title'))
if link.has_key('length'):
xlink.setAttribute('length', link.get('length'))
xentry.appendChild(xlink)
@@ -177,6 +181,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
+ if illegal_xml_chars.search(data.data):
+ data = xdoc.createTextNode(
+ illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')
@@ -225,6 +232,10 @@ def source(xsource, source, bozo, format):
for contributor in source.get('contributors',[]):
author(xsource, 'contributor', contributor)
+ if not source.has_key('links') and source.has_key('href'): #rss
+ source['links'] = [{ 'href': source.get('href') }]
+ if source.has_key('title'):
+ source['links'][0]['title'] = source.get('title')
links(xsource, source)
content(xsource, 'rights', source.get('rights_detail',None), bozo)
@@ -273,6 +284,11 @@ def reconstitute(feed, entry):
date(xentry, 'updated', entry_updated(feed.feed, entry, time.gmtime()))
date(xentry, 'published', entry.get('published_parsed',None))
+ if entry.has_key('dc_date.taken'):
+ date_Taken = createTextElement(xentry, '%s:%s' % ('dc','date_Taken'), '%s' % entry.get('dc_date.taken', None))
+ date_Taken.setAttribute('xmlns:%s' % 'dc', 'http://purl.org/dc/elements/1.1/')
+ xentry.appendChild(date_Taken)
+
for tag in entry.get('tags',[]):
category(xentry, tag)
@@ -298,6 +314,21 @@ def reconstitute(feed, entry):
if entry.has_key('geo_lat') and \
entry.has_key('geo_long'):
location(xentry, (float)(entry.get('geo_long',None)), (float)(entry.get('geo_lat',None)))
+ if entry.has_key('georss_point'):
+ coordinates = re.split('[,\s]', entry.get('georss_point'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_line'):
+ coordinates = re.split('[,\s]', entry.get('georss_line'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_circle'):
+ coordinates = re.split('[,\s]', entry.get('georss_circle'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_box'):
+ coordinates = re.split('[,\s]', entry.get('georss_box'))
+ location(xentry, ((float)(coordinates[1])+(float)(coordinates[3]))/2, ((float)(coordinates[0])+(float)(coordinates[2]))/2)
+ elif entry.has_key('georss_polygon'):
+ coordinates = re.split('[,\s]', entry.get('georss_polygon'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
# author / contributor
author_detail = entry.get('author_detail',{})
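
To make the coordinate handling above concrete (sample coordinates are made up): georss values are split on commas and whitespace into latitude-first pairs, points/lines/circles/polygons use their first pair, and a box is reduced to its centre before being passed to location() as (longitude, latitude).

    import re

    point = re.split('[,\s]', '45.256 -71.92')
    print float(point[1]), float(point[0])        # -71.92 45.256 (lon, lat)

    box = re.split('[,\s]', '42.9 -71.3 43.1 -71.1')
    print (float(box[1]) + float(box[3])) / 2, \
          (float(box[0]) + float(box[2])) / 2     # -71.2 43.0 (centre of the box)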
diff --git a/planet/scrub.py b/planet/scrub.py
index 586edde..fef5c22 100644
--- a/planet/scrub.py
+++ b/planet/scrub.py
@@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
- # Run this through HTML5's serializer
- from html5lib import html5parser, sanitizer, treebuilders
+ # Run this through HTML5's sanitizer
+ doc = None
+ if 'xhtml' in node['type']:
+ try:
+ from xml.dom import minidom
+ doc = minidom.parseString(node['value'])
+ except:
+ node['type']='text/html'
+
+ if not doc:
+ from html5lib import html5parser, treebuilders
+ p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+ doc = p.parseFragment(node['value'], encoding='utf-8')
+
from html5lib import treewalkers, serializer
- p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
- tree=treebuilders.getTreeBuilder('dom'))
- doc = p.parseFragment(node.value, encoding='utf-8')
+ from html5lib.filters import sanitizer
+ walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
- walker = treewalkers.getTreeWalker('dom')
- tree = xhtml.serialize(walker(doc), encoding='utf-8')
+ tree = xhtml.serialize(walker, encoding='utf-8')
+
node['value'] = ''.join([str(token) for token in tree])
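
A minimal sketch of the new sanitization path, reusing exactly the calls above on a throwaway fragment: parse to a DOM, walk it through html5lib's sanitizer filter, and re-serialize as XHTML.

    from html5lib import html5parser, treebuilders, treewalkers, serializer
    from html5lib.filters import sanitizer

    p = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = p.parseFragment('<p onclick="alert(1)">hi</p>', encoding='utf-8')
    walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
    xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
    print ''.join([str(token) for token in xhtml.serialize(walker, encoding='utf-8')])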
diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py
index 49b8557..a65b121 100644
--- a/planet/shell/__init__.py
+++ b/planet/shell/__init__.py
@@ -33,7 +33,7 @@ def run(template_file, doc, mode='template'):
log.info(" %s", os.path.realpath(template_dir))
logged_modes.append(mode)
return
- template_resolved = os.path.realpath(template_resolved)
+ template_resolved = os.path.abspath(template_resolved)
# Add shell directory to the path, if not already there
shellpath = os.path.join(sys.path[0],'planet','shell')
diff --git a/planet/shell/_genshi.py b/planet/shell/_genshi.py
index 5dffab2..7880a3a 100644
--- a/planet/shell/_genshi.py
+++ b/planet/shell/_genshi.py
@@ -3,7 +3,9 @@ from xml.sax.saxutils import escape
from genshi.input import HTMLParser, XMLParser
from genshi.template import Context, MarkupTemplate
+import planet
+log = planet.logger
subscriptions = []
feed_types = [
'application/atom+xml',
@@ -28,13 +30,15 @@ def find_config(config, feed):
if link.has_key('type') and link.type in feed_types:
if link.has_key('href') and link.href in subscriptions:
return norm(dict(config.parser.items(link.href)))
-
+
# match based on name
- for sub in subscriptions:
- if config.parser.has_option(sub, 'name') and \
- norm(config.parser.get(sub, 'name')) == feed.planet_name:
- return norm(dict(config.parser.items(sub)))
+ if 'planet_name' in feed:
+ for sub in subscriptions:
+ if config.parser.has_option(sub, 'name') and \
+ norm(config.parser.get(sub, 'name')) == feed.planet_name:
+ return norm(dict(config.parser.items(sub)))
+ log.warning('Could not match subscription to config: %s', feed.link)
return {}
class XHTMLParser(object):
@@ -68,7 +72,7 @@ def run(script, doc, output_file=None, options={}):
context = Context(**options)
tmpl_fileobj = open(script)
- tmpl = MarkupTemplate(tmpl_fileobj, script)
+ tmpl = MarkupTemplate(tmpl_fileobj, script, lookup="lenient")
tmpl_fileobj.close()
if not output_file:
diff --git a/planet/shell/dj.py b/planet/shell/dj.py
index c8a54a9..d2199fc 100644
--- a/planet/shell/dj.py
+++ b/planet/shell/dj.py
@@ -19,7 +19,7 @@ def run(script, doc, output_file=None, options={}):
# I need to re-import the settings at every call because I have to
# set the TEMPLATE_DIRS variable programmatically
from django.conf import settings
- settings._wrapped = None
+ settings._wrapped=None
try:
settings.configure(
DEBUG=True, TEMPLATE_DEBUG=True,
@@ -32,7 +32,7 @@ def run(script, doc, output_file=None, options={}):
# set up the Django context by using the default htmltmpl
# datatype converters
- context = Context()
+ context = Context(autoescape=(config.django_autoescape()=='on'))
context.update(tmpl.template_info(doc))
context['Config'] = config.planet_options()
t = get_template(script)
diff --git a/planet/shell/tmpl.py b/planet/shell/tmpl.py
index 3c8cb6b..b0f238f 100644
--- a/planet/shell/tmpl.py
+++ b/planet/shell/tmpl.py
@@ -231,6 +231,7 @@ def template_info(source):
output['link'] = config.link()
output['owner_name'] = config.owner_name()
output['owner_email'] = config.owner_email()
+ output['pubsubhubbub_hub'] = config.pubsubhubbub_hub()
if config.feed():
output['feed'] = config.feed()
output['feedtype'] = config.feed().find('rss')>=0 and 'rss' or 'atom'
@@ -267,8 +268,10 @@ def run(script, doc, output_file=None, options={}):
tp.set(key, value)
if output_file:
+ basename = os.path.basename(output_file)
reluri = os.path.splitext(os.path.basename(output_file))[0]
tp.set('url', urlparse.urljoin(config.link(),reluri))
+ tp.set('fullurl', urlparse.urljoin(config.link(),basename))
output = open(output_file, "w")
output.write(tp.process(template))
diff --git a/planet/shell/xslt.py b/planet/shell/xslt.py
index 0b6579f..24173ea 100644
--- a/planet/shell/xslt.py
+++ b/planet/shell/xslt.py
@@ -8,7 +8,7 @@ def quote(string, apos):
if string.find("'")<0:
return "'" + string + "'"
- elif string.find("'")<0:
+ elif string.find('"')<0:
return '"' + string + '"'
else:
# unclear how to quote strings with both types of quotes for libxslt
diff --git a/planet/spider.py b/planet/spider.py
index 59afcb6..50d1739 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -69,6 +69,7 @@ def _is_http_uri(uri):
def writeCache(feed_uri, feed_info, data):
log = planet.logger
sources = config.cache_sources_directory()
+ blacklist = config.cache_blacklist_directory()
# capture http status
if not data.has_key("status"):
@@ -125,7 +126,7 @@ def writeCache(feed_uri, feed_info, data):
log.info("Updating feed %s", feed_uri)
# if read failed, retain cached information
- if not data.version and feed_info.version:
+ if not data.get('version') and feed_info.get('version'):
data.feed = feed_info.feed
data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
data.version = feed_info.feed.get('planet_format')
@@ -147,7 +148,7 @@ def writeCache(feed_uri, feed_info, data):
data.feed['planet_content_hash'] = data.headers['-content-hash']
# capture feed and data from the planet configuration file
- if data.version:
+ if data.get('version'):
if not data.feed.has_key('links'): data.feed['links'] = list()
feedtype = 'application/atom+xml'
if data.version.startswith('rss'): feedtype = 'application/rss+xml'
@@ -175,7 +176,9 @@ def writeCache(feed_uri, feed_info, data):
# generate an id, if none is present
if not entry.has_key('id') or not entry.id:
entry['id'] = reconstitute.id(None, entry)
- if not entry['id']: continue
+ elif hasattr(entry['id'], 'values'):
+ entry['id'] = entry['id'].values()[0]
+ if not entry['id']: continue
# determine updated date for purposes of selection
updated = ''
@@ -190,6 +193,13 @@ def writeCache(feed_uri, feed_info, data):
cache = config.cache_directory()
for updated, entry in ids.values():
+ # compute blacklist file name based on the id
+ blacklist_file = filename(blacklist, entry.id)
+
+ # check if blacklist file exists. If so, skip it.
+ if os.path.exists(blacklist_file):
+ continue
+
# compute cache file name based on the id
cache_file = filename(cache, entry.id)
@@ -420,8 +430,6 @@ def spiderPlanet(only_if_new = False):
# Process the results as they arrive
feeds_seen = {}
while fetch_queue.qsize() or parse_queue.qsize() or threads:
- while parse_queue.qsize() == 0 and threads:
- time.sleep(0.1)
while parse_queue.qsize():
(uri, feed_info, feed) = parse_queue.get(False)
try:
@@ -479,6 +487,8 @@ def spiderPlanet(only_if_new = False):
traceback.format_tb(tb)):
log.error(line.rstrip())
+ time.sleep(0.1)
+
for index in threads.keys():
if not threads[index].isAlive():
del threads[index]
diff --git a/planet/splice.py b/planet/splice.py
index f751975..b399eca 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -44,6 +44,12 @@ def splice():
link.setAttribute('type', "application/%s+xml" % config.feedtype())
feed.appendChild(link)
+ if config.pubsubhubbub_hub():
+ hub = doc.createElement('link')
+ hub.setAttribute('rel', 'hub')
+ hub.setAttribute('href', config.pubsubhubbub_hub())
+ feed.appendChild(hub)
+
if config.link():
link = doc.createElement('link')
link.setAttribute('rel', 'alternate')
@@ -58,6 +64,21 @@ def splice():
data=feedparser.parse(filename(sources,sub))
if data.feed.has_key('id'): sub_ids.append(data.feed.id)
if not data.feed: continue
+
+ # warn on missing links
+ if not data.feed.has_key('planet_message'):
+ if not data.feed.has_key('links'): data.feed['links'] = []
+
+ for link in data.feed.links:
+ if link.rel == 'self': break
+ else:
+ log.debug('missing self link for ' + sub)
+
+ for link in data.feed.links:
+ if link.rel == 'alternate' and 'html' in link.type: break
+ else:
+ log.debug('missing html link for ' + sub)
+
xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
reconstitute.source(xdoc.documentElement, data.feed, None, None)
@@ -68,6 +89,7 @@ def splice():
# insert entry information
items = 0
count = {}
+ atomNS='http://www.w3.org/2005/Atom'
new_feed_items = config.new_feed_items()
for mtime,file in dir:
if index != None:
@@ -81,7 +103,7 @@ def splice():
# number of entries contributed by this feed does not exceed
# config.new_feed_items
entry.normalize()
- sources = entry.getElementsByTagName('source')
+ sources = entry.getElementsByTagNameNS(atomNS, 'source')
if sources:
ids = sources[0].getElementsByTagName('id')
if ids:
@@ -93,6 +115,8 @@ def splice():
ids = sources[0].getElementsByTagName('planet:id')
if not ids: continue
id = ids[0].childNodes[0].nodeValue
+ if id not in sub_ids:
+ log.warn('Skipping: ' + id)
if id not in sub_ids: continue
# add entry to feed
diff --git a/planet/vendor/feedparser.py b/planet/vendor/feedparser.py
index 6518126..76167ce 100755
--- a/planet/vendor/feedparser.py
+++ b/planet/vendor/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
-__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 314 $"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@@ -1131,7 +1131,7 @@ class _FeedParserMixin:
def _getContext(self):
if self.insource:
context = self.sourcedata
- elif self.inimage:
+ elif self.inimage and self.feeddata.has_key('image'):
context = self.feeddata['image']
elif self.intextinput:
context = self.feeddata['textinput']
@@ -1595,9 +1595,12 @@ if _XML_AVAILABLE:
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
+ self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
+ if uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
@@ -1622,7 +1625,7 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
+ attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -1679,8 +1682,11 @@ if _XML_AVAILABLE:
class _BaseHTMLProcessor(sgmllib.SGMLParser):
special = re.compile('''[<>'"]''')
bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
+ elements_no_end_tag = [
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ]
def __init__(self, encoding, type):
self.encoding = encoding
@@ -2461,6 +2467,15 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
+ # add implicit namespaces to html5 inline svg/mathml
+ if self.type.endswith('html'):
+ if tag=='svg':
+ if not dict(attrs).get('xmlns'):
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ if not dict(attrs).get('xmlns'):
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
# not otherwise acceptable, perhaps it is MathML or SVG?
if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
self.mathmlOK += 1
diff --git a/planet/vendor/html5lib/__init__.py b/planet/vendor/html5lib/__init__.py
index 7a20994..ae64f14 100644
--- a/planet/vendor/html5lib/__init__.py
+++ b/planet/vendor/html5lib/__init__.py
@@ -8,9 +8,10 @@ Example usage:
import html5lib
f = open("my_document.html")
-p = html5lib.HTMLParser()
-tree = p.parse(f)
+tree = html5lib.parse(f)
"""
-from html5parser import HTMLParser, parse
+__version__ = "%(version)s"
+from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
+from treewalkers import getTreeWalker
from serializer import serialize
diff --git a/planet/vendor/html5lib/constants.py b/planet/vendor/html5lib/constants.py
index c9f5883..f9521c8 100644
--- a/planet/vendor/html5lib/constants.py
+++ b/planet/vendor/html5lib/constants.py
@@ -180,6 +180,8 @@ E = {
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
+ "unexpected-form-in-table":
+ _(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
@@ -256,21 +258,18 @@ E = {
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
+ "eof-in-script-in-script":
+ _(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
+ "unexpected-end-tag-before-html":
+ _(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
-contentModelFlags = {
- "PCDATA":0,
- "RCDATA":1,
- "CDATA":2,
- "PLAINTEXT":3
-}
-
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
@@ -509,6 +508,8 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
+xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
@@ -878,6 +879,44 @@ entities = {
"zwnj;": u"\u200C"
}
+replacementCharacters = {
+ 0x0:u"\uFFFD",
+ 0x0d:u"\u000A",
+ 0x80:u"\u20AC",
+ 0x81:u"\u0081",
+ 0x81:u"\u0081",
+ 0x82:u"\u201A",
+ 0x83:u"\u0192",
+ 0x84:u"\u201E",
+ 0x85:u"\u2026",
+ 0x86:u"\u2020",
+ 0x87:u"\u2021",
+ 0x88:u"\u02C6",
+ 0x89:u"\u2030",
+ 0x8A:u"\u0160",
+ 0x8B:u"\u2039",
+ 0x8C:u"\u0152",
+ 0x8D:u"\u008D",
+ 0x8E:u"\u017D",
+ 0x8F:u"\u008F",
+ 0x90:u"\u0090",
+ 0x91:u"\u2018",
+ 0x92:u"\u2019",
+ 0x93:u"\u201C",
+ 0x94:u"\u201D",
+ 0x95:u"\u2022",
+ 0x96:u"\u2013",
+ 0x97:u"\u2014",
+ 0x98:u"\u02DC",
+ 0x99:u"\u2122",
+ 0x9A:u"\u0161",
+ 0x9B:u"\u203A",
+ 0x9C:u"\u0153",
+ 0x9D:u"\u009D",
+ 0x9E:u"\u017E",
+ 0x9F:u"\u0178",
+}
+
encodings = {
'437': 'cp437',
'850': 'cp850',
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index a8e5a1f..5ff742a 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -4,6 +4,29 @@ except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
+
+try:
+ any
+except:
+ # Implement 'any' for python 2.4 and previous
+ def any(iterable):
+ for element in iterable:
+ if element:
+ return True
+ return False
+
+try:
+ "abc".startswith(("a", "b"))
+ def startswithany(str, prefixes):
+ return str.startswith(prefixes)
+except:
+ # Python 2.4 doesn't accept a tuple as argument to string startswith
+ def startswithany(str, prefixes):
+ for prefix in prefixes:
+ if str.startswith(prefix):
+ return True
+ return False
+
import sys
import inputstream
@@ -14,7 +37,7 @@ from treebuilders._base import Marker
from treebuilders import simpletree
import utils
-from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
+from constants import spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
@@ -26,6 +49,12 @@ def parse(doc, treebuilder="simpletree", encoding=None,
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding)
+def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
+ namespaceHTMLElements=True):
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parseFragment(doc, container=container, encoding=encoding)
+
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
@@ -60,7 +89,7 @@ class HTMLParser(object):
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
"afterHead": AfterHeadPhase(self, self.tree),
"inBody": InBodyPhase(self, self.tree),
- "inCDataRCData": InCDataRCDataPhase(self, self.tree),
+ "text": TextPhase(self, self.tree),
"inTable": InTablePhase(self, self.tree),
"inTableText": InTableTextPhase(self, self.tree),
"inCaption": InCaptionPhase(self, self.tree),
@@ -107,14 +136,14 @@ class HTMLParser(object):
self.innerHTML = self.container.lower()
if self.innerHTML in cdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+ self.tokenizer.state = self.tokenizer.rcdataState
elif self.innerHTML in rcdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+ self.tokenizer.state = self.tokenizer.rawtextState
elif self.innerHTML == 'plaintext':
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+ self.tokenizer.state = self.tokenizer.plaintextState
else:
- # contentModelFlag already is PCDATA
- #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
+ # state already is data state
+ # self.tokenizer.state = self.tokenizer.dataState
pass
self.phase = self.phases["beforeHtml"]
self.phase.insertHtmlElement()
@@ -152,8 +181,6 @@ class HTMLParser(object):
for token in self.normalizedTokens():
- #print self.phase.__class__.__name__
- #print token
type = token["type"]
if type == CharactersToken:
self.phase.processCharacters(token)
@@ -376,18 +403,22 @@ class HTMLParser(object):
self.phase = self.phases["inBody"]
break
- def parseRCDataCData(self, token, contentType):
- """Generic (R)CDATA Parsing algorithm
- contentType - RCDATA or CDATA
+ def parseRCDataRawtext(self, token, contentType):
+ """Generic RCDATA/RAWTEXT Parsing algorithm
+ contentType - RCDATA or RAWTEXT
"""
- assert contentType in ("CDATA", "RCDATA")
+ assert contentType in ("RAWTEXT", "RCDATA")
element = self.tree.insertElement(token)
- self.tokenizer.contentModelFlag = contentModelFlags[contentType]
+
+ if contentType == "RAWTEXT":
+ self.tokenizer.state = self.tokenizer.rawtextState
+ else:
+ self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
- self.phase = self.phases["inCDataRCData"]
+ self.phase = self.phases["text"]
class Phase(object):
"""Base class for helper object that implements each phase of processing
@@ -441,34 +472,24 @@ class Phase(object):
self.endTagHandler[token["name"]](token)
class InitialPhase(Phase):
- # This phase deals with error handling as well which is currently not
- # covered in the specification. The error handling is typically known as
- # "quirks mode". It is expected that a future version of HTML5 will defin
- # this.
- def processEOF(self):
- self.parser.parseError("expected-doctype-but-got-eof")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
- self.parser.phase.processEOF()
-
+ def processSpaceCharacters(self, token):
+ pass
+
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
-
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId != None or
- systemId != None):
+ systemId != None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
- if systemId is None:
- systemId = ""
self.tree.insertDoctype(token)
@@ -476,117 +497,108 @@ class InitialPhase(Phase):
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html"
- or publicId in
- ("+//silmaril//dtd html pro v0r11 19970101//en",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
- "-//as//dtd html 3.0 aswedit + extensions//en",
- "-//ietf//dtd html 2.0 level 1//en",
- "-//ietf//dtd html 2.0 level 2//en",
- "-//ietf//dtd html 2.0 strict level 1//en",
- "-//ietf//dtd html 2.0 strict level 2//en",
- "-//ietf//dtd html 2.0 strict//en",
- "-//ietf//dtd html 2.0//en",
- "-//ietf//dtd html 2.1e//en",
- "-//ietf//dtd html 3.0//en",
- "-//ietf//dtd html 3.0//en//",
- "-//ietf//dtd html 3.2 final//en",
- "-//ietf//dtd html 3.2//en",
- "-//ietf//dtd html 3//en",
- "-//ietf//dtd html level 0//en",
- "-//ietf//dtd html level 0//en//2.0",
- "-//ietf//dtd html level 1//en",
- "-//ietf//dtd html level 1//en//2.0",
- "-//ietf//dtd html level 2//en",
- "-//ietf//dtd html level 2//en//2.0",
- "-//ietf//dtd html level 3//en",
- "-//ietf//dtd html level 3//en//3.0",
- "-//ietf//dtd html strict level 0//en",
- "-//ietf//dtd html strict level 0//en//2.0",
- "-//ietf//dtd html strict level 1//en",
- "-//ietf//dtd html strict level 1//en//2.0",
- "-//ietf//dtd html strict level 2//en",
- "-//ietf//dtd html strict level 2//en//2.0",
- "-//ietf//dtd html strict level 3//en",
- "-//ietf//dtd html strict level 3//en//3.0",
- "-//ietf//dtd html strict//en",
- "-//ietf//dtd html strict//en//2.0",
- "-//ietf//dtd html strict//en//3.0",
- "-//ietf//dtd html//en",
- "-//ietf//dtd html//en//2.0",
- "-//ietf//dtd html//en//3.0",
- "-//metrius//dtd metrius presentational//en",
- "-//microsoft//dtd internet explorer 2.0 html strict//en",
- "-//microsoft//dtd internet explorer 2.0 html//en",
- "-//microsoft//dtd internet explorer 2.0 tables//en",
- "-//microsoft//dtd internet explorer 3.0 html strict//en",
- "-//microsoft//dtd internet explorer 3.0 html//en",
- "-//microsoft//dtd internet explorer 3.0 tables//en",
- "-//netscape comm. corp.//dtd html//en",
- "-//netscape comm. corp.//dtd strict html//en",
- "-//o'reilly and associates//dtd html 2.0//en",
- "-//o'reilly and associates//dtd html extended 1.0//en",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//en",
- "-//spyglass//dtd html 2.0 extended//en",
- "-//sq//dtd html 2.0 hotmetal + extensions//en",
- "-//sun microsystems corp.//dtd hotjava html//en",
- "-//sun microsystems corp.//dtd hotjava strict html//en",
- "-//w3c//dtd html 3 1995-03-24//en",
- "-//w3c//dtd html 3.2 draft//en",
- "-//w3c//dtd html 3.2 final//en",
- "-//w3c//dtd html 3.2//en",
- "-//w3c//dtd html 3.2s draft//en",
- "-//w3c//dtd html 4.0 frameset//en",
- "-//w3c//dtd html 4.0 transitional//en",
- "-//w3c//dtd html experimental 19960712//en",
- "-//w3c//dtd html experimental 970421//en",
- "-//w3c//dtd w3 html//en",
- "-//w3o//dtd w3 html 3.0//en",
- "-//w3o//dtd w3 html 3.0//en//",
- "-//w3o//dtd w3 html strict 3.0//en//",
- "-//webtechs//dtd mozilla html 2.0//en",
- "-//webtechs//dtd mozilla html//en",
- "-/w3c/dtd html 4.0 transitional/en",
- "html")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)
- or (systemId != None and
- systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+ or startswithany(publicId,
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//"))
+ or publicId in
+ ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")
+ or startswithany(publicId,
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId == None
+ or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
- elif (publicId in
- ("-//w3c//dtd xhtml 1.0 frameset//EN",
- "-//w3c//dtd xhtml 1.0 transitional//EN")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)):
+ elif (startswithany(publicId,
+ ("-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//"))
+ or startswithany(publicId,
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId != None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
-
- def processSpaceCharacters(self, token):
- pass
+
+ def anythingElse(self):
+ self.parser.compatMode = "quirks"
+ self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processCharacters(token)
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processEndTag(token)
+
+ def processEOF(self):
+ self.parser.parseError("expected-doctype-but-got-eof")
+ self.anythingElse()
+ self.parser.phase.processEOF()
class BeforeHtmlPhase(Phase):
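
The quirks-mode detection above now matches public identifiers by prefix through a startswithany helper, which this commit presumably defines elsewhere in html5parser.py (it is not visible in this hunk). A minimal sketch of the behaviour those calls assume:

    def startswithany(string, prefixes):
        # True if string begins with at least one of the given prefixes.
        for prefix in prefixes:
            if string.startswith(prefix):
                return True
        return False
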
@@ -617,8 +629,12 @@ class BeforeHtmlPhase(Phase):
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
- self.insertHtmlElement()
- self.parser.phase.processEndTag(token)
+ if token["name"] not in ("head", "body", "html", "br"):
+ self.parser.parseError("unexpected-end-tag-before-html",
+ {"name": token["name"]})
+ else:
+ self.insertHtmlElement()
+ self.parser.phase.processEndTag(token)
class BeforeHeadPhase(Phase):
@@ -632,7 +648,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
- (("head", "br"), self.endTagImplyHead)
+ (("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -647,6 +663,9 @@ class BeforeHeadPhase(Phase):
self.startTagHead(impliedTagToken("head", "StartTag"))
self.parser.phase.processCharacters(token)
+ def startTagHtml(self, token):
+ self.parser.phases["inBody"].processStartTag(token)
+
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
@@ -673,8 +692,8 @@ class InHeadPhase(Phase):
("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
("script", self.startTagScript),
- (("base", "link", "command", "eventsource"),
- self.startTagBaseLinkCommandEventsource),
+ (("base", "link", "command"),
+ self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
@@ -709,7 +728,7 @@ class InHeadPhase(Phase):
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
- def startTagBaseLinkCommandEventsource(self, token):
+ def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
@@ -724,23 +743,27 @@ class InHeadPhase(Phase):
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif "content" in attributes:
- data = inputstream.EncodingBytes(
- attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
+ # Encoding it as UTF-8 here is a hack, as really we should pass
+ # the abstract Unicode string, and just use the
+ # ContentAttrParser on that, but using UTF-8 allows all chars
+            # to be encoded and as an ASCII-superset works.
+ data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
- self.parser.parseRCDataCData(token, "RCDATA")
+ self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token):
#Need to decide whether to implement the scripting-disabled case
- self.parser.parseRCDataCData(token, "CDATA")
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagScript(self, token):
- #I think this is equivalent to the CDATA stuff since we don't execute script
- #self.tree.insertElement(token)
- self.parser.parseRCDataCData(token, "CDATA")
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+ self.parser.originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
@@ -819,7 +842,6 @@ class AfterHeadPhase(Phase):
self.parser.phase.processStartTag(token)
def endTagHtmlBodyBr(self, token):
- #This is not currently in the spec
self.anythingElse()
self.parser.phase.processEndTag(token)
@@ -833,8 +855,8 @@ class AfterHeadPhase(Phase):
class InBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-body
- # the crazy mode
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+ # the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
@@ -843,15 +865,16 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
- (("base", "link", "meta", "script", "style", "title"),
- self.startTagProcessInHead),
+ (("base", "command", "link", "meta", "noframes", "script", "style",
+ "title"), self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "datagrid",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
- "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "listing",
- "menu", "nav", "ol", "p", "pre", "section", "ul"),
+ "details", "dir", "div", "dl", "fieldset", "figure",
+ "footer", "header", "hgroup", "menu", "nav", "ol", "p",
+ "section", "ul"),
self.startTagCloseP),
+ (("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext",self.startTagPlaintext),
@@ -865,13 +888,14 @@ class InBodyPhase(Phase):
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "basefont", "bgsound", "br", "embed", "img", "input",
- "keygen", "param", "spacer", "wbr"), self.startTagVoidFormatting),
+ "keygen", "spacer", "wbr"), self.startTagVoidFormatting),
+ (("param", "source"), self.startTagParamSource),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
- (("noembed", "noframes", "noscript"), self.startTagCdata),
+ (("noembed", "noframes", "noscript"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
@@ -879,8 +903,7 @@ class InBodyPhase(Phase):
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
- "tr"), self.startTagMisplaced),
- (("event-source", "command"), self.startTagNew)
+ "tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
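
The dispatcher tables above and below rely on utils.MethodDispatcher; a rough sketch (an assumption, not the vendored code) of the behaviour they depend on, namely tuple keys fanning out to one entry per tag name plus a .default fallback:

    class MethodDispatcher(dict):
        """Dict keyed by tag name; tuple/list keys register every name."""
        def __init__(self, items):
            entries = []
            for name, handler in items:
                if isinstance(name, (list, tuple, frozenset, set)):
                    entries.extend((n, handler) for n in name)
                else:
                    entries.append((name, handler))
            dict.__init__(self, entries)
            self.default = None
        def __getitem__(self, key):
            # Unknown names fall through to .default (startTagOther etc.)
            return dict.get(self, key, self.default)
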
@@ -888,9 +911,9 @@ class InBodyPhase(Phase):
("body",self.endTagBody),
("html",self.endTagHtml),
(("address", "article", "aside", "blockquote", "center", "datagrid",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
- "footer", "header", "listing", "menu", "nav", "ol", "pre", "section",
- "ul"), self.endTagBlock),
+ "details", "dir", "div", "dl", "fieldset", "figure",
+ "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre",
+ "section", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p",self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
@@ -933,14 +956,10 @@ class InBodyPhase(Phase):
self.tree.insertText(data)
def processCharacters(self, token):
- # XXX The specification says to do this for every character at the
- # moment, but apparently that doesn't match the real world so we don't
- # do it for space characters.
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
- self.framesetOK = False
+ self.parser.framesetOK = False
- #This matches the current spec but may not match the real world
def processSpaceCharacters(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
@@ -976,9 +995,13 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
- if token["name"] in ("pre", "listing"):
- self.parser.framesetOK = False
- self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+
+ def startTagPreListing(self, token):
+ if self.tree.elementInScope("p"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+ self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, token):
if self.tree.formPointer:
@@ -991,39 +1014,31 @@ class InBodyPhase(Phase):
def startTagListItem(self, token):
self.parser.framesetOK = False
- if self.tree.elementInScope("p"):
- self.endTagP(impliedTagToken("p"))
- stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")}
- stopName = stopNames[token["name"]]
- # AT Use reversed in Python 2.4...
- for i, node in enumerate(self.tree.openElements[::-1]):
- if node.name in stopName:
- poppedNodes = []
- for j in range(i+1):
- poppedNodes.append(self.tree.openElements.pop())
- if i >= 1:
- self.parser.parseError(
- i == 1 and "missing-end-tag" or "missing-end-tags",
- {"name": u", ".join([item.name
- for item
- in poppedNodes[:-1]])})
- break
-
- # Phrasing elements are all non special, non scoping, non
- # formatting elements
- if (node.nameTuple in
- (specialElements | scopingElements)
- and node.name not in ("address", "div")):
+ stopNamesMap = {"li":["li"],
+ "dt":["dt", "dd"],
+ "dd":["dt", "dd"]}
+ stopNames = stopNamesMap[token["name"]]
+ for node in reversed(self.tree.openElements):
+ if node.name in stopNames:
+ self.parser.phase.processEndTag(
+ impliedTagToken(node.name, "EndTag"))
+ break
+ if (node.nameTuple in (scopingElements | specialElements) and
+ node.name not in ("address", "div", "p")):
break
- # Always insert an <li> element.
+
+ if self.tree.elementInScope("p"):
+ self.parser.phase.processEndTag(
+ impliedTagToken("p", "EndTag"))
+
self.tree.insertElement(token)
def startTagPlaintext(self, token):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
- self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
+ self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
def startTagHeading(self, token):
if self.tree.elementInScope("p"):
@@ -1031,15 +1046,6 @@ class InBodyPhase(Phase):
if self.tree.openElements[-1].name in headingElements:
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
self.tree.openElements.pop()
- # Uncomment the following for IE7 behavior:
- #
- #for item in headingElements:
- # if self.tree.elementInScope(item):
- # self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
- # item = self.tree.openElements.pop()
- # while item.name not in headingElements:
- # item = self.tree.openElements.pop()
- # break
self.tree.insertElement(token)
def startTagA(self, token):
@@ -1088,9 +1094,11 @@ class InBodyPhase(Phase):
self.parser.framesetOK = False
def startTagXmp(self, token):
+ if self.tree.elementInScope("p"):
+ self.endTagP(impliedTagToken("p"))
self.tree.reconstructActiveFormattingElements()
- self.parser.parseRCDataCData(token, "CDATA")
self.parser.framesetOK = False
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagTable(self, token):
if self.parser.compatMode != "quirks":
@@ -1107,6 +1115,11 @@ class InBodyPhase(Phase):
token["selfClosingAcknowledged"] = True
self.parser.framesetOK = False
+ def startTagParamSource(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
def startTagHr(self, token):
if self.tree.elementInScope("p"):
self.endTagP(impliedTagToken("p"))
@@ -1156,19 +1169,18 @@ class InBodyPhase(Phase):
self.processEndTag(impliedTagToken("form"))
def startTagTextarea(self, token):
- # XXX Form element pointer checking here as well...
self.tree.insertElement(token)
- self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
+ self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
self.parser.framesetOK = False
def startTagIFrame(self, token):
self.parser.framesetOK = False
- self.startTagCdata(token)
+ self.startTagRawtext(token)
- def startTagCdata(self, token):
+ def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
- self.parser.parseRCDataCData(token, "CDATA")
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagOpt(self, token):
if self.tree.elementInScope("option"):
@@ -1238,46 +1250,34 @@ class InBodyPhase(Phase):
"""
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
- def startTagNew(self, token):
- """New HTML5 elements, "event-source", "section", "nav",
- "article", "aside", "header", "footer", "datagrid", "command"
- """
- #2007-08-30 - MAP - commenting out this write to sys.stderr because
- # it's really annoying me when I run the validator tests
- #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
- self.startTagOther(token)
- #raise NotImplementedError
-
def startTagOther(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(token)
def endTagP(self, token):
- if self.tree.elementInScope("p"):
- self.tree.generateImpliedEndTags("p")
- if self.tree.openElements[-1].name != "p":
+ if not self.tree.elementInScope("p"):
+ self.startTagCloseP(impliedTagToken("p", "StartTag"))
self.parser.parseError("unexpected-end-tag", {"name": "p"})
- if self.tree.elementInScope("p"):
- while self.tree.elementInScope("p"):
- self.tree.openElements.pop()
+ self.endTagP(impliedTagToken("p", "EndTag"))
else:
- self.startTagCloseP(impliedTagToken("p", "StartTag"))
- self.endTagP(impliedTagToken("p"))
+ self.tree.generateImpliedEndTags("p")
+ if self.tree.openElements[-1].name != "p":
+ self.parser.parseError("unexpected-end-tag", {"name": "p"})
+ node = self.tree.openElements.pop()
+ while node.name != "p":
+ node = self.tree.openElements.pop()
def endTagBody(self, token):
- # XXX Need to take open <p> tags into account here. We shouldn't imply
- # </p> but we should not throw a parse error either. Specification is
- # likely to be updated.
- if (len(self.tree.openElements) == 1 or
- self.tree.openElements[1].name != "body"):
- # innerHTML case
+ if not self.tree.elementInScope("body"):
self.parser.parseError()
return
elif self.tree.openElements[-1].name != "body":
for node in self.tree.openElements[2:]:
- if node.name not in frozenset(("dd", "dt", "li", "p",
+ if node.name not in frozenset(("dd", "dt", "li", "optgroup",
+ "option", "p", "rp", "rt",
"tbody", "td", "tfoot",
- "th", "thead", "tr")):
+ "th", "thead", "tr", "body",
+ "html")):
#Not sure this is the correct name for the parse error
self.parser.parseError(
"expected-one-end-tag-but-got-another",
@@ -1286,8 +1286,9 @@ class InBodyPhase(Phase):
self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, token):
- self.endTagBody(impliedTagToken("body"))
- if not self.parser.innerHTML:
+ #We repeat the test for the body end tag token being ignored here
+ if self.tree.elementInScope("body"):
+ self.endTagBody(impliedTagToken("body"))
self.parser.phase.processEndTag(token)
def endTagBlock(self, token):
@@ -1307,7 +1308,7 @@ class InBodyPhase(Phase):
def endTagForm(self, token):
node = self.tree.formPointer
self.tree.formPointer = None
- if node is None or not self.tree.elementInScope(token["name"]):
+ if node is None or not self.tree.elementInScope(node.name):
self.parser.parseError("unexpected-end-tag",
{"name":"form"})
else:
@@ -1315,17 +1316,21 @@ class InBodyPhase(Phase):
if self.tree.openElements[-1].name != node:
self.parser.parseError("end-tag-too-early-ignored",
{"name": "form"})
- self.tree.openElements.remove(node)
+ self.tree.openElements.remove(node)
def endTagListItem(self, token):
- # AT Could merge this with the Block case
- if self.tree.elementInScope(token["name"]):
- self.tree.generateImpliedEndTags(token["name"])
-
- if self.tree.openElements[-1].name != token["name"]:
- self.parser.parseError("end-tag-too-early", {"name": token["name"]})
-
- if self.tree.elementInScope(token["name"]):
+ if token["name"] == "li":
+ variant = "list"
+ else:
+ variant = None
+ if not self.tree.elementInScope(token["name"], variant=variant):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+ else:
+ self.tree.generateImpliedEndTags(exclude = token["name"])
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError(
+ "end-tag-too-early",
+ {"name": token["name"]})
node = self.tree.openElements.pop()
while node.name != token["name"]:
node = self.tree.openElements.pop()
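
elementInScope calls throughout the rest of this file move from a positional boolean (or tableVariant=True) to a variant keyword taking None, "list", or "table". A hedged standalone sketch of the scope rules those variants imply per the HTML5 spec; this is an assumption, not the vendored treebuilders/_base.py implementation:

    SCOPE_STOPPERS = {
        None:    frozenset(["applet", "caption", "html", "table", "td",
                            "th", "marquee", "object"]),
        "list":  frozenset(["applet", "caption", "html", "table", "td",
                            "th", "marquee", "object", "ol", "ul"]),
        "table": frozenset(["html", "table"]),
    }

    def element_in_scope(open_element_names, target, variant=None):
        """Walk the stack of open elements from the innermost outwards."""
        stoppers = SCOPE_STOPPERS[variant]
        for name in reversed(open_element_names):
            if name == target:
                return True
            if name in stoppers:
                return False
        return False
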
@@ -1352,26 +1357,28 @@ class InBodyPhase(Phase):
name = token["name"]
while True:
# Step 1 paragraph 1
- afeElement = self.tree.elementInActiveFormattingElements(
+ formattingElement = self.tree.elementInActiveFormattingElements(
token["name"])
- if not afeElement or (afeElement in self.tree.openElements and
- not self.tree.elementInScope(afeElement.name)):
+ if not formattingElement or (formattingElement in
+ self.tree.openElements and
+ not self.tree.elementInScope(
+ formattingElement.name)):
self.parser.parseError("adoption-agency-1.1", {"name": token["name"]})
return
# Step 1 paragraph 2
- elif afeElement not in self.tree.openElements:
+ elif formattingElement not in self.tree.openElements:
self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
- self.tree.activeFormattingElements.remove(afeElement)
+ self.tree.activeFormattingElements.remove(formattingElement)
return
# Step 1 paragraph 3
- if afeElement != self.tree.openElements[-1]:
+ if formattingElement != self.tree.openElements[-1]:
self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
# Step 2
# Start of the adoption agency algorithm proper
- afeIndex = self.tree.openElements.index(afeElement)
+ afeIndex = self.tree.openElements.index(formattingElement)
furthestBlock = None
for element in self.tree.openElements[afeIndex:]:
if (element.nameTuple in
@@ -1382,7 +1389,7 @@ class InBodyPhase(Phase):
# Step 3
if furthestBlock is None:
element = self.tree.openElements.pop()
- while element != afeElement:
+ while element != formattingElement:
element = self.tree.openElements.pop()
self.tree.activeFormattingElements.remove(element)
return
@@ -1397,7 +1404,7 @@ class InBodyPhase(Phase):
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
- bookmark = self.tree.activeFormattingElements.index(afeElement)
+ bookmark = self.tree.activeFormattingElements.index(formattingElement)
# Step 6
lastNode = node = furthestBlock
@@ -1412,7 +1419,7 @@ class InBodyPhase(Phase):
self.tree.openElements.index(node)-1]
self.tree.openElements.remove(tmpNode)
# Step 6.3
- if node == afeElement:
+ if node == formattingElement:
break
# Step 6.4
if lastNode == furthestBlock:
@@ -1429,7 +1436,7 @@ class InBodyPhase(Phase):
self.tree.openElements.index(node)] = clone
node = clone
- # Step 7.6
+ # Step 6.6
# Remove lastNode from its parents, if any
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
@@ -1447,7 +1454,7 @@ class InBodyPhase(Phase):
commonAncestor.appendChild(lastNode)
# Step 8
- clone = afeElement.cloneNode()
+ clone = formattingElement.cloneNode()
# Step 9
furthestBlock.reparentChildren(clone)
@@ -1456,11 +1463,11 @@ class InBodyPhase(Phase):
furthestBlock.appendChild(clone)
# Step 11
- self.tree.activeFormattingElements.remove(afeElement)
+ self.tree.activeFormattingElements.remove(formattingElement)
self.tree.activeFormattingElements.insert(bookmark, clone)
# Step 12
- self.tree.openElements.remove(afeElement)
+ self.tree.openElements.remove(formattingElement)
self.tree.openElements.insert(
self.tree.openElements.index(furthestBlock) + 1, clone)
@@ -1498,7 +1505,7 @@ class InBodyPhase(Phase):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break
-class InCDataRCDataPhase(Phase):
+class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([])
@@ -1518,7 +1525,7 @@ class InCDataRCDataPhase(Phase):
self.parser.phase.processEOF()
def startTagOther(self, token):
- assert False, "Tried to process start tag %s in (R)CDATA mode"%name
+        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode"%token["name"]
def endTagScript(self, token):
node = self.tree.openElements.pop()
@@ -1544,7 +1551,8 @@ class InTablePhase(Phase):
(("td", "th", "tr"), self.startTagImplyTbody),
("table", self.startTagTable),
(("style", "script"), self.startTagStyleScript),
- ("input", self.startTagInput)
+ ("input", self.startTagInput),
+ ("form", self.startTagForm)
])
self.startTagHandler.default = self.startTagOther
@@ -1638,6 +1646,11 @@ class InTablePhase(Phase):
else:
self.startTagOther(token)
+ def startTagForm(self, token):
+ self.parser.parseError("unexpected-form-in-table")
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+
def startTagOther(self, token):
self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
if "tainted" not in self.getCurrentTable()._flags:
@@ -1648,7 +1661,7 @@ class InTablePhase(Phase):
self.tree.insertFromTable = False
def endTagTable(self, token):
- if self.tree.elementInScope("table", True):
+ if self.tree.elementInScope("table", variant="table"):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError("end-tag-too-early-named",
@@ -1695,10 +1708,10 @@ class InTableTextPhase(Phase):
self.phase = self.originalPhase
self.phase.processComment(token)
- def processEOF(self, token):
+ def processEOF(self):
self.flushCharacters()
self.phase = self.originalPhase
- self.phase.processEOF(token)
+ self.phase.processEOF()
def processCharacters(self, token):
self.characterTokens.append(token)
@@ -1740,7 +1753,7 @@ class InCaptionPhase(Phase):
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self):
- return not self.tree.elementInScope("caption", True)
+ return not self.tree.elementInScope("caption", variant="table")
def processEOF(self):
self.parser.phases["inBody"].processEOF()
@@ -1911,9 +1924,9 @@ class InTableBodyPhase(Phase):
def startTagTableOther(self, token):
# XXX AT Any ideas on how to share this with endTagTable?
- if (self.tree.elementInScope("tbody", True) or
- self.tree.elementInScope("thead", True) or
- self.tree.elementInScope("tfoot", True)):
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
@@ -1926,7 +1939,7 @@ class InTableBodyPhase(Phase):
self.parser.phases["inTable"].processStartTag(token)
def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.clearStackToTableBodyContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
@@ -1935,9 +1948,9 @@ class InTableBodyPhase(Phase):
{"name": token["name"]})
def endTagTable(self, token):
- if (self.tree.elementInScope("tbody", True) or
- self.tree.elementInScope("thead", True) or
- self.tree.elementInScope("tfoot", True)):
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(
impliedTagToken(self.tree.openElements[-1].name))
@@ -1983,7 +1996,7 @@ class InRowPhase(Phase):
self.tree.openElements.pop()
def ignoreEndTagTr(self):
- return not self.tree.elementInScope("tr", tableVariant=True)
+ return not self.tree.elementInScope("tr", variant="table")
# the rest
def processEOF(self):
@@ -2030,7 +2043,7 @@ class InRowPhase(Phase):
self.parser.phase.processEndTag(token)
def endTagTableRowGroup(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagTr("tr")
self.parser.phase.processEndTag(token)
else:
@@ -2064,9 +2077,9 @@ class InCellPhase(Phase):
# helper
def closeCell(self):
- if self.tree.elementInScope("td", True):
+ if self.tree.elementInScope("td", variant="table"):
self.endTagTableCell(impliedTagToken("td"))
- elif self.tree.elementInScope("th", True):
+ elif self.tree.elementInScope("th", variant="table"):
self.endTagTableCell(impliedTagToken("th"))
# the rest
@@ -2077,8 +2090,8 @@ class InCellPhase(Phase):
self.parser.phases["inBody"].processCharacters(token)
def startTagTableOther(self, token):
- if (self.tree.elementInScope("td", True) or
- self.tree.elementInScope("th", True)):
+ if (self.tree.elementInScope("td", variant="table") or
+ self.tree.elementInScope("th", variant="table")):
self.closeCell()
self.parser.phase.processStartTag(token)
else:
@@ -2093,7 +2106,7 @@ class InCellPhase(Phase):
self.parser.phases["inBody"].processStartTag
def endTagTableCell(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.tree.generateImpliedEndTags(token["name"])
if self.tree.openElements[-1].name != token["name"]:
self.parser.parseError("unexpected-cell-end-tag",
@@ -2113,7 +2126,7 @@ class InCellPhase(Phase):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def endTagImply(self, token):
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.closeCell()
self.parser.phase.processEndTag(token)
else:
@@ -2178,7 +2191,7 @@ class InSelectPhase(Phase):
def startTagInput(self, token):
self.parser.parseError("unexpected-input-in-select")
- if self.tree.elementInScope("select", True):
+ if self.tree.elementInScope("select", variant="table"):
self.endTagSelect("select")
self.parser.phase.processStartTag(token)
@@ -2207,7 +2220,7 @@ class InSelectPhase(Phase):
{"name": "optgroup"})
def endTagSelect(self, token):
- if self.tree.elementInScope("select", True):
+ if self.tree.elementInScope("select", variant="table"):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
@@ -2219,7 +2232,7 @@ class InSelectPhase(Phase):
def endTagTableElements(self, token):
self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]})
- if self.tree.elementInScope(token["name"], True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagSelect("select")
self.parser.phase.processEndTag(token)
@@ -2260,7 +2273,7 @@ class InSelectInTablePhase(Phase):
def endTagTable(self, token):
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
- if self.tree.elementInScope(token["name"], tableVariant=True):
+ if self.tree.elementInScope(token["name"], variant="table"):
self.endTagOther(impliedTagToken("select"))
self.parser.phase.processEndTag(token)
diff --git a/planet/vendor/html5lib/ihatexml.py b/planet/vendor/html5lib/ihatexml.py
index 0803474..dd78563 100644
--- a/planet/vendor/html5lib/ihatexml.py
+++ b/planet/vendor/html5lib/ihatexml.py
@@ -72,44 +72,38 @@ def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
- rv.append(intToUnicodeStr(item[0]))
+ rv.append(escapeRegexp(unichr(item[0])))
else:
- rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
- return "[%s]"%"|".join(rv)
+ rv.append(escapeRegexp(unichr(item[0])) + "-" +
+ escapeRegexp(unichr(item[1])))
+ return "[%s]"%"".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
-def intToUnicodeStr(intValue):
- #There must be a better (non-evil) way to do this
- return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
-
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
- string = string.replace(char, r"\\" + char)
+ string = string.replace(char, "\\" + char)
if char in string:
print string
return string
#output from the above
-nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+
+nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
- def __init__(self, replaceChars = None,
- replaceRanges = None,
+ def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
- if replaceRanges is not None or replaceChars is not None:
- raise NotImplementedError
- else:
- self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@@ -147,14 +141,27 @@ class InfosetFilter(object):
return data
def toXmlName(self, name):
- replaceChars = set(self.replaceCharsRegexp.findall(name))
+ nameFirst = name[0]
+ nameRest = name[1:]
+ m = nonXmlNameFirstBMPRegexp.match(nameFirst)
+ if m:
+ nameFirstOutput = self.getReplacementCharacter(nameFirst)
+ else:
+ nameFirstOutput = nameFirst
+
+ nameRestOutput = nameRest
+ replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
- if char in self.replaceCache:
- replacement = self.replaceCache[char]
- else:
- replacement = self.escapeChar(char)
- name = name.replace(char, replacement)
- return name
+ replacement = self.getReplacementCharacter(char)
+ nameRestOutput = nameRestOutput.replace(char, replacement)
+ return nameFirstOutput + nameRestOutput
+
+ def getReplacementCharacter(self, char):
+ if char in self.replaceCache:
+ replacement = self.replaceCache[char]
+ else:
+ replacement = self.escapeChar(char)
+ return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
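
toXmlName now checks the first character against a separate name-start class before escaping the rest. A hedged usage sketch: the "U" plus five hex digits escape form is inferred from replacementRegexp above, and the import path assumes planet/vendor is on sys.path so the vendored package imports as html5lib:

    from html5lib.ihatexml import InfosetFilter

    f = InfosetFilter()
    f.toXmlName(u"1foo2")         # expected u"U00031foo2": a leading digit is
                                  # not a valid XML name start; later digits are
    f.fromXmlName(u"U00031foo2")  # expected u"1foo2"
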
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index bec848f..46f46b1 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -5,6 +5,7 @@ import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings, ReparseException
+import utils
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
@@ -158,7 +159,6 @@ class HTMLInputStream:
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
-
self.reset()
def reset(self):
@@ -382,14 +382,9 @@ class HTMLInputStream:
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
- if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
- pos < len(data) - 1 and
- ord(data[pos + 1]) >= 0xDC00 and
- ord(data[pos + 1]) <= 0xDFFF):
+ if utils.isSurrogatePair(data[pos:pos+2]):
#We have a surrogate pair!
- #From a perl manpage
- char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
- (ord(data[pos + 1]) - 0xDC00))
+ char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
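
The inline surrogate-pair arithmetic removed above moves behind utils.isSurrogatePair and utils.surrogatePairToCodepoint (utils is imported at the top of this file; its own changes are elsewhere in this commit). A sketch consistent with the removed code:

    def isSurrogatePair(data):
        # Two code units forming a UTF-16 high/low surrogate pair.
        return (len(data) == 2 and
                0xD800 <= ord(data[0]) <= 0xDBFF and
                0xDC00 <= ord(data[1]) <= 0xDFFF)

    def surrogatePairToCodepoint(data):
        # e.g. u"\ud835\udc9c" -> 0x1D49C (MATHEMATICAL SCRIPT CAPITAL A)
        return (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
                (ord(data[1]) - 0xDC00))
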
@@ -449,6 +444,20 @@ class HTMLInputStream:
r = u"".join(rv)
return r
+ def charsUntilEOF(self):
+ """ Returns a string of characters from the stream up to EOF."""
+
+ rv = []
+
+ while True:
+ rv.append(self.chunk[self.chunkOffset:])
+ if not self.readChunk():
+ # Reached EOF
+ break
+
+ r = u"".join(rv)
+ return r
+
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
@@ -471,7 +480,7 @@ class EncodingBytes(str):
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
- return str.__new__(self, value)
+ return str.__new__(self, value.lower())
def __init__(self, value):
self._position=-1
@@ -539,14 +548,12 @@ class EncodingBytes(str):
self._position = p
return None
- def matchBytes(self, bytes, lower=False):
+ def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p+len(bytes)]
- if lower:
- data = data.lower()
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
@@ -557,6 +564,9 @@ class EncodingBytes(str):
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
+ # XXX: This is ugly, but I can't see a nicer way to fix this.
+ if self._position == -1:
+ self._position = 0
self._position += (newPosition + len(bytes)-1)
return True
else:
@@ -581,7 +591,7 @@ class EncodingParser(object):
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
- if self.data.matchBytes(key, lower=True):
+ if self.data.matchBytes(key):
try:
keepParsing = method()
break
@@ -659,59 +669,59 @@ class EncodingParser(object):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
+ # Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset("/"))
- if c == "<":
- data.previous()
- return None
- elif c == ">" or c is None:
+ # Step 2
+ if c in (">", None):
return None
+ # Step 3
attrName = []
attrValue = []
- spaceFound = False
- #Step 5 attribute name
+ #Step 4 attribute name
while True:
if c == "=" and attrName:
break
elif c in spaceCharactersBytes:
- spaceFound=True
+ #Step 6!
+ c = data.skip()
+ c = data.next()
break
- elif c in ("/", "<", ">"):
+ elif c in ("/", ">"):
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
+ elif c == None:
+ return None
else:
attrName.append(c)
- #Step 6
+ #Step 5
c = data.next()
#Step 7
- if spaceFound:
- c = data.skip()
- #Step 8
- if c != "=":
- data.previous()
- return "".join(attrName), ""
- #XXX need to advance position in both spaces and value case
- #Step 9
+ if c != "=":
+ data.previous()
+ return "".join(attrName), ""
+ #Step 8
data.next()
- #Step 10
+ #Step 9
c = data.skip()
- #Step 11
+ #Step 10
if c in ("'", '"'):
- #11.1
+ #10.1
quoteChar = c
while True:
- #11.3
+ #10.2
c = data.next()
+ #10.3
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
- #11.4
+ #10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
- #11.5
+ #10.5
else:
attrValue.append(c)
- elif c in (">", "<"):
+ elif c == ">":
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
@@ -719,12 +729,15 @@ class EncodingParser(object):
return None
else:
attrValue.append(c)
+ # Step 11
while True:
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
+ elif c is None:
+ return None
else:
attrValue.append(c)
@@ -734,10 +747,6 @@ class ContentAttrParser(object):
self.data = data
def parse(self):
try:
- #Skip to the first ";"
- self.data.jumpTo(";")
- self.data.position += 1
- self.data.skip()
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
@@ -753,8 +762,10 @@ class ContentAttrParser(object):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
- self.data.jumpTo(quoteMark)
- return self.data[oldPosition:self.data.position]
+ if self.data.jumpTo(quoteMark):
+ return self.data[oldPosition:self.data.position]
+ else:
+ return None
else:
#Unquoted value
oldPosition = self.data.position
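
A usage sketch of the two classes changed above, mirroring the call made from InHeadPhase.startTagMeta earlier in this diff; the import path again assumes planet/vendor is on sys.path so the vendored package imports as html5lib:

    from html5lib.inputstream import EncodingBytes, ContentAttrParser

    data = EncodingBytes("text/html; charset=UTF-8")
    codec = ContentAttrParser(data).parse()
    # expected "utf-8": EncodingBytes now lowercases its value at construction,
    # and parse() no longer insists on skipping to a leading ";" first, so a
    # bare charset=... value should also be accepted.
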
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index 79e358f..05face9 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -7,15 +7,19 @@ from constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
- 'ul', 'var']
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4',
+ 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd',
+ 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol',
+ 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre',
+ 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound',
+ 'source', 'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
+ 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt',
+ 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
@@ -24,24 +28,35 @@ class HTMLSanitizerMixin(object):
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+ 'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
- 'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
- 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
- 'xml:lang']
+ 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
+ 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
+ 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
+ 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
+ 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
+ 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
+ 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
+ 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
+ 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
+ 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
+ 'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
+ 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
+ 'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
+ 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
+ 'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
+ 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
+ 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
+ 'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
@@ -54,43 +69,45 @@ class HTMLSanitizerMixin(object):
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
- 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
- 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
- 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
- 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
- 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
- 'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
- 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
- 'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
- 'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
- 'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
- 'stemh', 'stemv', 'stop-color', 'stop-opacity',
- 'strikethrough-position', 'strikethrough-thickness', 'stroke',
- 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
- 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
- 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
- 'transform', 'type', 'u1', 'u2', 'underline-position',
- 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
- 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
- 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
- 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
- 'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
- 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+ 'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
+ 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
+ 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
+ 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
+ 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
+ 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
+ 'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
+ 'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
+ 'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
+ 'opacity', 'orient', 'origin', 'overline-position',
+ 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
+ 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
+ 'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
+ 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
+ 'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
+ 'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
+ 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
+ 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
+ 'transform', 'type', 'u1', 'u2', 'underline-position',
+ 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
+ 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
+ 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
+ 'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
- 'xlink:href', 'xml:base']
+ 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
- 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
+ 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
+ 'mask', 'stroke']
- svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
- 'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
- 'radialGradient', 'textpath', 'tref', 'set', 'use']
+ svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
+ 'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
+ 'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
+ 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
@@ -140,7 +157,13 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
- if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
+
+ # accommodate filters which use token_type differently
+ token_type = token["type"]
+ if token_type in tokenTypes.keys():
+ token_type = tokenTypes[token_type]
+
+ if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
if token.has_key("data"):
@@ -152,6 +175,8 @@ class HTMLSanitizerMixin(object):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
+ #remove replacement characters from unescaped characters
+ val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
@@ -170,19 +195,24 @@ class HTMLSanitizerMixin(object):
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else:
- if token["type"] == tokenTypes["EndTag"]:
+ if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
- if token["type"] == tokenTypes["EmptyTag"]:
+ if token.get("selfClosing"):
token["data"]=token["data"][:-1] + "/>"
- token["type"] = tokenTypes["Characters"]
+
+ if token["type"] in tokenTypes.keys():
+ token["type"] = "Characters"
+ else:
+ token["type"] = tokenTypes["Characters"]
+
del token["name"]
return token
- elif token["type"] == tokenTypes["Comment"]:
+ elif token_type == tokenTypes["Comment"]:
pass
else:
return token
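The sanitizer hunks above widen the attribute whitelist, tolerate filters that carry token types as strings rather than integers, and strip U+FFFD replacement characters before the protocol check. A minimal sketch of how the sanitizing tokenizer is wired in, assuming the 0.90-era html5lib API where HTMLParser accepts a tokenizer class:

    # Hedged sketch: run an untrusted fragment through the sanitizing
    # tokenizer; non-whitelisted attributes and javascript: URLs are dropped.
    import html5lib
    from html5lib import sanitizer, treebuilders

    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("simpletree"))
    fragment = parser.parseFragment(
        u'<a href="javascript:alert(1)" onclick="x()">Click here for $100</a>')
    # Expected result, as in the docstring example above:
    #   <a>Click here for $100</a>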
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index a2e2f45..45f1d06 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -8,8 +8,8 @@ import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
-from html5lib.constants import rcdataElements
-
+from html5lib.constants import rcdataElements, entities, xmlEntities
+from html5lib import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
@@ -27,20 +27,33 @@ else:
for k, v in entities.items():
if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
- encode_entity_map[v] = k
+ encode_entity_map[ord(v)] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
- for c in exc.object[exc.start:exc.end]:
- e = encode_entity_map.get(c)
+ codepoints = []
+ skip = False
+ for i, c in enumerate(exc.object[exc.start:exc.end]):
+ if skip:
+ skip = False
+ continue
+ index = i + exc.start
+ if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
+ codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
+ skip = True
+ else:
+ codepoint = ord(c)
+ codepoints.append(codepoint)
+ for cp in codepoints:
+ e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
- res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
+ res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
@@ -54,26 +67,32 @@ def encode(text, encoding):
class HTMLSerializer(object):
+ # attribute quoting options
quote_attr_values = False
quote_char = '"'
use_best_quote_char = True
- minimize_boolean_attributes = True
+ # tag syntax options
+ omit_optional_tags = True
+ minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
+
+ # escaping options
escape_lt_in_attrs = False
escape_rcdata = False
+ resolve_entities = True
+ # miscellaneous options
inject_meta_charset = True
strip_whitespace = False
sanitize = False
- omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
- "escape_rcdata", 'use_trailing_solidus', "sanitize")
+ "escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
@@ -103,7 +122,23 @@ class HTMLSerializer(object):
for token in treewalker:
type = token["type"]
if type == "Doctype":
- doctype = u"<!DOCTYPE %s>" % token["name"]
+ doctype = u"<!DOCTYPE %s" % token["name"]
+
+ if token["publicId"]:
+ doctype += u' PUBLIC "%s"' % token["publicId"]
+ elif token["systemId"]:
+ doctype += u" SYSTEM"
+ if token["systemId"]:
+ if token["systemId"].find(u'"') >= 0:
+ if token["systemId"].find(u"'") >= 0:
+ self.serializeError(_("System identifer contains both single and double quote characters"))
+ quote_char = u"'"
+ else:
+ quote_char = u'"'
+ doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
+
+ doctype += u">"
+
if encoding:
yield doctype.encode(encoding)
else:
@@ -198,6 +233,19 @@ class HTMLSerializer(object):
comment = comment.encode(encoding, unicode_encode_errors)
yield comment
+ elif type == "Entity":
+ name = token["name"]
+ key = name + ";"
+ if not key in entities:
+ self.serializeError(_("Entity %s not recognized" % name))
+ if self.resolve_entities and key not in xmlEntities:
+ data = entities[key]
+ else:
+ data = u"&%s;" % name
+ if encoding:
+ data = data.encode(encoding, unicode_encode_errors)
+ yield data
+
else:
self.serializeError(token["data"])
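The htmlentityreplace_errors changes above key the entity map by codepoint and handle surrogate pairs, so characters the output encoding cannot represent are emitted as named entities, falling back to hex character references. A simplified sketch of the same codecs error-handler idea; the tiny entity map is an assumption standing in for html5lib's full table:

    # Simplified sketch, not the vendored handler: unencodable characters
    # become named entities when known, numeric references otherwise.
    import codecs

    ENTITY_MAP = {0x00a0: "nbsp", 0x00e9: "eacute"}

    def entityreplace(exc):
        if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            raise exc
        out = []
        for ch in exc.object[exc.start:exc.end]:
            name = ENTITY_MAP.get(ord(ch))
            if name:
                out.append(u"&%s;" % name)
            else:
                out.append(u"&#x%x;" % ord(ch))
        return (u"".join(out), exc.end)

    codecs.register_error("entityreplace", entityreplace)
    print u"caf\u00e9\u00a0bar".encode("ascii", "entityreplace")
    # -> caf&eacute;&nbsp;bar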
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index d884782..d7c4b5f 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -9,11 +9,12 @@ try:
except ImportError:
from utils import deque
-from constants import contentModelFlags, spaceCharacters
+from constants import spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF
from constants import tokenTypes, tagTokenTypes
+from constants import replacementCharacters
from inputstream import HTMLInputStream
@@ -47,7 +48,6 @@ class HTMLTokenizer:
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
- self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False
self.lastFourChars = []
self.state = self.dataState
@@ -96,41 +96,43 @@ class HTMLTokenizer:
# Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix)
- if charAsInt == 13:
+ # Certain characters get replaced with others
+ if charAsInt in replacementCharacters:
+ char = replacementCharacters[charAsInt]
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "incorrect-cr-newline-entity"})
- charAsInt = 10
- elif 127 < charAsInt < 160:
- # If the integer is between 127 and 160 (so 128 and bigger and 159
- # and smaller) we need to do the "windows trick".
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "illegal-windows-1252-entity"})
-
- charAsInt = entitiesWindows1252[charAsInt - 128]
-
- # Certain characters get replaced with U+FFFD
- if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
- or (0x007F <= charAsInt <= 0x009F)
- or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
- or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
- or (0x10FFFF < charAsInt)):
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
+ elif ((0xD800 <= charAsInt <= 0xDFFF) or
+ (charAsInt > 0x10FFFF)):
char = u"\uFFFD"
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
else:
+ #Should speed up this check somehow (e.g. move the set to a constant)
+ if ((0x0001 <= charAsInt <= 0x0008) or
+ (0x000E <= charAsInt <= 0x001F) or
+ (0x007F <= charAsInt <= 0x009F) or
+ (0xFDD0 <= charAsInt <= 0xFDEF) or
+ charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
+ 0xFFFFF, 0x10FFFE, 0x10FFFF])):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data":
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
try:
- # XXX We should have a separate function that does "int" to
- # "unicodestring" conversion since this doesn't always work
- # according to hsivonen. Also, unichr has a limitation of 65535
+ # Try/except needed as UCS-2 Python builds' unichr only works
+ # within the BMP.
char = unichr(charAsInt)
- except:
- try:
- char = eval("u'\\U%08x'" % charAsInt)
- except:
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "cant-convert-numeric-entity",
- "datavars": {"charAsInt": charAsInt}})
+ except ValueError:
+ char = eval("u'\\U%08x'" % charAsInt)
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
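The try/except above matters on narrow (UCS-2) Python 2 builds, where unichr() raises ValueError for codepoints outside the BMP; the eval fallback builds the equivalent surrogate pair instead. A standalone sketch of the same conversion (the helper name is made up):

    # Sketch: turn a numeric character reference into a unicode string,
    # even for codepoints above U+FFFF on a narrow Python 2 build.
    def charref_to_unicode(charAsInt):
        try:
            return unichr(charAsInt)               # BMP, or a wide build
        except ValueError:                         # narrow build, > 0xFFFF
            return eval("u'\\U%08x'" % charAsInt)  # yields a surrogate pair

    smiley = charref_to_unicode(0x1F600)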
@@ -146,8 +148,8 @@ class HTMLTokenizer:
output = u"&"
charStack = [self.stream.char()]
- if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \
- or (allowedChar is not None and allowedChar == charStack[0]):
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&")
+ or (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == u"#":
@@ -251,43 +253,14 @@ class HTMLTokenizer:
# Below are the various tokenizer states worked out.
def dataState(self):
- #XXX - consider splitting this state based on the content model flag
data = self.stream.char()
-
- # Keep a charbuffer to handle the escapeFlag
- if (self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
- if len(self.lastFourChars) == 4:
- self.lastFourChars.pop(0)
- self.lastFourChars.append(data)
-
- # The rest of the logic
- if (data == "&" and self.contentModelFlag in
- (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
- not self.escapeFlag):
+ if data == "&":
self.state = self.entityDataState
- elif (data == "-" and self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
- not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
- self.escapeFlag = True
- self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data":data})
- elif (data == "<" and (self.contentModelFlag ==
- contentModelFlags["PCDATA"]
- or (self.contentModelFlag in
- (contentModelFlags["CDATA"],
- contentModelFlags["RCDATA"]) and
- self.escapeFlag == False))):
+ elif data == "<":
self.state = self.tagOpenState
- elif (data == ">" and self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
- self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
- self.escapeFlag = False
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
elif data is EOF:
# Tokenization ends.
return False
-
elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
@@ -298,13 +271,7 @@ class HTMLTokenizer:
# have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences
else:
- if (self.contentModelFlag in
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
- chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
- self.lastFourChars += chars[-4:]
- self.lastFourChars = self.lastFourChars[-4:]
- else:
- chars = self.stream.charsUntil((u"&", u"<"))
+ chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
@@ -313,97 +280,108 @@ class HTMLTokenizer:
self.consumeEntity()
self.state = self.dataState
return True
-
- def tagOpenState(self):
+
+ def rcdataState(self):
data = self.stream.char()
- if self.contentModelFlag == contentModelFlags["PCDATA"]:
- if data == u"!":
- self.state = self.markupDeclarationOpenState
- elif data == u"/":
- self.state = self.closeTagOpenState
- elif data in asciiLetters:
- self.currentToken = {"type": tokenTypes["StartTag"],
- "name": data, "data": [],
- "selfClosing": False,
- "selfClosingAcknowledged": False}
- self.state = self.tagNameState
- elif data == u">":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-right-bracket"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
- self.state = self.dataState
- elif data == u"?":
- # XXX In theory it could be something besides a tag name. But
- # do we really care?
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name-but-got-question-mark"})
- self.stream.unget(data)
- self.state = self.bogusCommentState
- else:
- # XXX
- self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
- "expected-tag-name"})
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
- self.stream.unget(data)
- self.state = self.dataState
- else:
- # We know the content model flag is set to either RCDATA or CDATA
- # now because this state can never be entered with the PLAINTEXT
- # flag.
- if data == u"/":
- self.state = self.closeTagOpenState
- else:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
- self.stream.unget(data)
- self.state = self.dataState
+ if data == "&":
+ self.state = self.characterReferenceInRcdata
+ elif data == "<":
+ self.state = self.rcdataLessThanSignState
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ elif data in spaceCharacters:
+ # Directly after emitting a token you switch back to the "data
+ # state". At that point spaceCharacters are important so they are
+ # emitted separately.
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
+ data + self.stream.charsUntil(spaceCharacters, True)})
+ # No need to update lastFourChars here, since the first space will
+ # have already been appended to lastFourChars and will have broken
+ # any <!-- or --> sequences
+ else:
+ chars = self.stream.charsUntil((u"&", u"<"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
return True
- def closeTagOpenState(self):
- if (self.contentModelFlag in (contentModelFlags["RCDATA"],
- contentModelFlags["CDATA"])):
-
- charStack = []
- if self.currentToken:
- # So far we know that "</" has been consumed. We now need to know
- # whether the next few characters match the name of last emitted
- # start tag which also happens to be the currentToken.
- matched = True
- for expected in self.currentToken["name"].lower():
- charStack.append(self.stream.char())
- if charStack[-1] not in (expected, expected.upper()):
- matched = False
- break
-
- # If the tag name prefix matched, we also need to check the
- # subsequent character
- if matched:
- charStack.append(self.stream.char())
- if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))):
- self.contentModelFlag = contentModelFlags["PCDATA"]
- # Unget the last character, so it can be re-processed
- # in the next state
- self.stream.unget(charStack.pop())
- # The remaining characters in charStack are the tag name
- self.currentToken = {"type": tokenTypes["EndTag"],
- "name": u"".join(charStack),
- "data": [],
- "selfClosing":False}
- self.state = self.tagNameState
- return True
-
- # Didn't find the end tag. The last character in charStack could be
- # anything, so it has to be re-processed in the data state
- self.stream.unget(charStack.pop())
+ def characterReferenceInRcdata(self):
+ self.consumeEntity()
+ self.state = self.rcdataState
+ return True
+
+ def rawtextState(self):
+ data = self.stream.char()
+ if data == "<":
+ self.state = self.rawtextLessThanSignState
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ else:
+ chars = self.stream.charsUntil((u"<"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def scriptDataState(self):
+ data = self.stream.char()
+ if data == "<":
+ self.state = self.scriptDataLessThanSignState
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ else:
+ chars = self.stream.charsUntil((u"<"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def plaintextState(self):
+ data = self.stream.char()
+ if data == EOF:
+ # Tokenization ends.
+ return False
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + self.stream.charsUntilEOF()})
+ return True
- # The remaining characters are a prefix of the tag name, so they're
- # just letters and digits, so they can be output as character
- # tokens immediately
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)})
+ def tagOpenState(self):
+ data = self.stream.char()
+ if data == u"!":
+ self.state = self.markupDeclarationOpenState
+ elif data == u"/":
+ self.state = self.closeTagOpenState
+ elif data in asciiLetters:
+ self.currentToken = {"type": tokenTypes["StartTag"],
+ "name": data, "data": [],
+ "selfClosing": False,
+ "selfClosingAcknowledged": False}
+ self.state = self.tagNameState
+ elif data == u">":
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name-but-got-right-bracket"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
self.state = self.dataState
- return True
+ elif data == u"?":
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name-but-got-question-mark"})
+ self.stream.unget(data)
+ self.state = self.bogusCommentState
+ else:
+ # XXX
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.stream.unget(data)
+ self.state = self.dataState
+ return True
+ def closeTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
@@ -444,6 +422,373 @@ class HTMLTokenizer:
# (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy)
return True
+
+ def rcdataLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.rcdataEndTagOpenState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rcdataEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.rcdataEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rcdataEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": u"</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rawtextLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.rawtextEndTagOpenState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def rawtextEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.rawtextEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def rawtextEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": u"</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def scriptDataLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataEndTagOpenState
+ elif data == "!":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<!"})
+ self.state = self.scriptDataEscapeStartState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.scriptDataEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": u"</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapeStartState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataEscapeStartDashState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapeStartDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataEscapedDashDashState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapedState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataEscapedDashState
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ chars = self.stream.charsUntil((u"<-"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def scriptDataEscapedDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataEscapedDashDashState
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedDashDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
+ self.state = self.scriptDataState
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataEscapedEndTagOpenState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data})
+ self.temporaryBuffer = data
+ self.state = self.scriptDataDoubleEscapeStartState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer = data
+ self.state = self.scriptDataEscapedEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing":False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": u"</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataDoubleEscapeStartState(self):
+ data = self.stream.char()
+ if data in (spaceCharacters | frozenset(("/", ">"))):
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ if self.temporaryBuffer.lower() == "script":
+ self.state = self.scriptDataDoubleEscapedState
+ else:
+ self.state = self.scriptDataEscapedState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.temporaryBuffer += data
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataDoubleEscapedState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataDoubleEscapedDashState
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ return True
+
+ def scriptDataDoubleEscapedDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ self.state = self.scriptDataDoubleEscapedDashDashState
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapedDashDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
+ self.state = self.scriptDataState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapedLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"})
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataDoubleEscapeEndState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapeEndState(self):
+ data = self.stream.char()
+ if data in (spaceCharacters | frozenset(("/", ">"))):
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ if self.temporaryBuffer.lower() == "script":
+ self.state = self.scriptDataEscapedState
+ else:
+ self.state = self.scriptDataDoubleEscapedState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.temporaryBuffer += data
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataDoubleEscapedState
+ return True
def beforeAttributeNameState(self):
data = self.stream.char()
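The rcdata, rawtext and script-data end-tag-name states added above all repeat the same "appropriate end tag" test: the buffered name must match the last emitted start tag, case-insensitively, before the tokenizer will leave the raw-text mode. A standalone sketch of just that check (the helper name is an assumption, not part of html5lib):

    # Sketch of the "appropriate end tag" rule shared by the
    # *EndTagNameState methods above.
    def is_appropriate_end_tag(currentToken, temporaryBuffer):
        return (currentToken is not None and
                currentToken["name"].lower() == temporaryBuffer.lower())

    start = {"type": "StartTag", "name": "textarea", "data": []}
    assert is_appropriate_end_tag(start, "TEXTAREA")
    assert not is_appropriate_end_tag(start, "div")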
@@ -562,7 +907,7 @@ class HTMLTokenizer:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-attribute-value-but-got-right-bracket"})
self.emitCurrentToken()
- elif data in (u"=", u"<"):
+ elif data in (u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"equals-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data
@@ -611,10 +956,10 @@ class HTMLTokenizer:
if data in spaceCharacters:
self.state = self.beforeAttributeNameState
elif data == u"&":
- self.processEntityInAttribute(None)
+ self.processEntityInAttribute(">")
elif data == u">":
self.emitCurrentToken()
- elif data in (u'"', u"'", u"=", u"<"):
+ elif data in (u'"', u"'", u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-character-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data
@@ -623,8 +968,8 @@ class HTMLTokenizer:
"eof-in-attribute-value-no-quotes"})
self.emitCurrentToken()
else:
- self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
- frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
+ self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
+ frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters)
return True
def afterAttributeValueState(self):
@@ -946,7 +1291,7 @@ class HTMLTokenizer:
matched = False
break
if matched:
- self.state = self.beforeDoctypePublicIdentifierState
+ self.state = self.afterDoctypePublicKeywordState
return True
elif data in (u"s", u"S"):
matched = True
@@ -957,7 +1302,7 @@ class HTMLTokenizer:
matched = False
break
if matched:
- self.state = self.beforeDoctypeSystemIdentifierState
+ self.state = self.afterDoctypeSystemKeywordState
return True
# All the characters read before the current 'data' will be
@@ -972,6 +1317,26 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState
return True
+
+ def afterDoctypePublicKeywordState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeDoctypePublicIdentifierState
+ elif data in ("'", '"'):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.stream.unget(data)
+ self.state = self.beforeDoctypePublicIdentifierState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.stream.unget(data)
+ self.state = self.beforeDoctypePublicIdentifierState
+ return True
def beforeDoctypePublicIdentifierState(self):
data = self.stream.char()
@@ -1045,17 +1410,47 @@ class HTMLTokenizer:
def afterDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
- pass
- elif data == "\"":
+ self.state = self.betweenDoctypePublicAndSystemIdentifiersState
+ elif data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == '"':
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+ return True
+
+ def betweenDoctypePublicAndSystemIdentifiersState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
- elif data is EOF:
+ elif data == '"':
+ self.currentToken["systemId"] = u""
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
+ elif data == "'":
+ self.currentToken["systemId"] = u""
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
+ elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
@@ -1068,6 +1463,26 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState
return True
+ def afterDoctypeSystemKeywordState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeDoctypeSystemIdentifierState
+ elif data in ("'", '"'):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.stream.unget(data)
+ self.state = self.beforeDoctypeSystemIdentifierState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.stream.unget(data)
+ self.state = self.beforeDoctypeSystemIdentifierState
+ return True
+
def beforeDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
diff --git a/planet/vendor/html5lib/treebuilders/__init__.py b/planet/vendor/html5lib/treebuilders/__init__.py
index 635f426..13278de 100755
--- a/planet/vendor/html5lib/treebuilders/__init__.py
+++ b/planet/vendor/html5lib/treebuilders/__init__.py
@@ -73,7 +73,22 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
+ # Come up with a sane default
+ if implementation == None:
+ try:
+ import xml.etree.cElementTree as ET
+ except ImportError:
+ try:
+ import xml.etree.ElementTree as ET
+ except ImportError:
+ try:
+ import cElementTree as ET
+ except ImportError:
+ import elementtree.ElementTree as ET
+ implementation = ET
import etree
- # XXX: NEVER cache here, caching is done in the etree submodule
+ # NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
+ else:
+ raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
return treeBuilderCache.get(treeType)
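With the default-implementation logic added above, callers can ask for the "etree" builder without choosing an ElementTree module themselves; cElementTree is preferred when importable. A short sketch, assuming the usual html5lib entry points:

    # Sketch: let getTreeBuilder pick a sane ElementTree implementation.
    import html5lib
    from html5lib import treebuilders

    TreeBuilder = treebuilders.getTreeBuilder("etree")
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    doc = parser.parse("<p>hello</p>")    # an ElementTree element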
diff --git a/planet/vendor/html5lib/treebuilders/_base.py b/planet/vendor/html5lib/treebuilders/_base.py
index 7b2ce4b..6ea5843 100755
--- a/planet/vendor/html5lib/treebuilders/_base.py
+++ b/planet/vendor/html5lib/treebuilders/_base.py
@@ -1,5 +1,4 @@
-import warnings
-from html5lib.constants import scopingElements, tableInsertModeElements
+from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
frozenset
except NameError:
@@ -115,7 +114,6 @@ class TreeBuilder(object):
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
- warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset()
def reset(self):
@@ -130,24 +128,23 @@ class TreeBuilder(object):
self.document = self.documentClass()
- def elementInScope(self, target, tableVariant=False):
+ def elementInScope(self, target, variant=None):
# Exit early when possible.
- if self.openElements[-1].name == target:
- return True
-
- # AT Use reverse instead of [::-1] when we can rely on Python 2.4
- # AT How about while True and simply set node to [-1] and set it to
- # [-2] at the end...
- for node in self.openElements[::-1]:
+ listElementsMap = {
+ None:scopingElements,
+ "list":scopingElements | set([(namespaces["html"], "ol"),
+ (namespaces["html"], "ul")]),
+ "table":set([(namespaces["html"], "html"),
+ (namespaces["html"], "table")])
+ }
+ listElements = listElementsMap[variant]
+
+ for node in reversed(self.openElements):
if node.name == target:
return True
- elif node.name == "table":
- return False
- elif (not tableVariant and (node.nameTuple in
- scopingElements)):
- return False
- elif node.name == "html":
+ elif node.nameTuple in listElements:
return False
+
assert False # We should never reach this point
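elementInScope now picks its terminating set from a small variant map instead of special-casing "table", so callers can ask for the list-scope or table-scope variants directly. A hedged sketch of the lookup with stand-in sets (the real sets come from html5lib.constants):

    # Sketch of the variant lookup used by elementInScope above.
    html = "http://www.w3.org/1999/xhtml"
    scopingElements = set([(html, "applet"), (html, "table"), (html, "html")])

    listElementsMap = {
        None:    scopingElements,
        "list":  scopingElements | set([(html, "ol"), (html, "ul")]),
        "table": set([(html, "html"), (html, "table")]),
    }

    def stops_search(nameTuple, variant=None):
        # True when this open element ends the in-scope search
        return nameTuple in listElementsMap[variant]

    assert stops_search((html, "ul"), "list")
    assert not stops_search((html, "ul"))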
def reconstructActiveFormattingElements(self):
@@ -160,27 +157,28 @@ class TreeBuilder(object):
return
# Step 2 and step 3: we start with the last element. So i is -1.
- i = -1
+ i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
- # Step 5: let entry be one earlier in the list.
- i -= 1
- try:
- entry = self.activeFormattingElements[i]
- except:
- # Step 4: at this point we need to jump to step 8. By not doing
- # i += 1 which is also done in step 7 we achieve that.
+ if i == 0:
+ #This will be reset to 0 below
+ i = -1
break
+ i -= 1
+ # Step 5: let entry be one earlier in the list.
+ entry = self.activeFormattingElements[i]
+
while True:
# Step 7
i += 1
# Step 8
- clone = self.activeFormattingElements[i].cloneNode()
+ entry = self.activeFormattingElements[i]
+ clone = entry.cloneNode() #Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type":"StartTag",
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index 8de1bdc..c094e1f 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -2,6 +2,7 @@
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
+import weakref
import _base
from html5lib import constants, ihatexml
@@ -22,34 +23,30 @@ def getDomModule(DomImplementation):
def getDomBuilder(DomImplementation):
Dom = DomImplementation
- infoset_filter = ihatexml.InfosetFilter()
class AttrList:
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
- self.element.setAttribute(infoset_filter.coerceAttribute(name),
- infoset_filter.coerceCharacters(value))
+ self.element.setAttribute(name, value)
def items(self):
- return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
+ return [(item[0], item[1]) for item in
self.element.attributes.items()]
def keys(self):
- return [infoset_filter.fromXmlName(item) for item in
- self.element.attributes.keys()]
+ return self.element.attributes.keys()
def __getitem__(self, name):
- name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
- return self.element.hasAttribute(infoset_filter.toXmlName(name))
+ return self.element.hasAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
- _base.Node.__init__(self, element.localName)
+ _base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
@@ -60,7 +57,6 @@ def getDomBuilder(DomImplementation):
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
- data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
@@ -91,17 +87,14 @@ def getDomBuilder(DomImplementation):
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
- qualifiedName = (name[0] + ":" +
- infoset_filter.coerceAttribute(
- name[1]))
+ qualifiedName = (name[0] + ":" + name[1])
else:
- qualifiedName = infoset_filter.coerceAttribute(
- name[1])
+ qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
- infoset_filter.coerceAttribute(name), value)
+ name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
@@ -121,7 +114,7 @@ def getDomBuilder(DomImplementation):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
- return self
+ return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
@@ -161,7 +154,7 @@ def getDomBuilder(DomImplementation):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
- data=infoset_filter.coerceCharacters(data)
+ data=data
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
@@ -199,8 +192,7 @@ def getDomBuilder(DomImplementation):
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
- element.namespaceURI not in (None,
- constants.namespaces["html"])):
+ element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
@@ -210,11 +202,13 @@ def getDomBuilder(DomImplementation):
i = 0
attr = element.attributes.item(i)
while attr:
- name = infoset_filter.fromXmlName(attr.localName)
+ name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
- name = "%s %s"%(constants.prefixes[ns], name)
+ name = "%s %s"%(constants.prefixes[ns], attr.localName)
+ else:
+ name = attr.nodeName
i += 1
attr = element.attributes.item(i)
@@ -241,12 +235,12 @@ def getDomBuilder(DomImplementation):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
- prefix = (attr.localName != 'xmlns' and attr.localName or None)
+ prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
- del attributes[(attr.namespaceURI, attr.localName)]
+ del attributes[(attr.namespaceURI, attr.nodeName)]
# apply namespace declarations
for attrname in node.attributes.keys():
@@ -254,8 +248,8 @@ def getDomBuilder(DomImplementation):
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
- del attributes[(attr.namespaceURI, attr.localName)]
- attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+ del attributes[(attr.namespaceURI, attr.nodeName)]
+ attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
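documentClass() in the dom builder now returns weakref.proxy(self) rather than self, presumably so the tree builder and the DOM document it holds do not keep each other alive through a reference cycle. A toy sketch of proxy behaviour (Owner is a made-up stand-in class):

    # Sketch: a weakref.proxy does not keep its referent alive.
    import weakref

    class Owner(object):
        pass

    o = Owner()
    p = weakref.proxy(o)
    del o                       # nothing else references Owner, so it dies
    try:
        p.anything              # touching a dead proxy raises ReferenceError
    except ReferenceError:
        print "proxy target has been collected"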
diff --git a/planet/vendor/html5lib/treebuilders/etree.py b/planet/vendor/html5lib/treebuilders/etree.py
index 6815582..62918f8 100755
--- a/planet/vendor/html5lib/treebuilders/etree.py
+++ b/planet/vendor/html5lib/treebuilders/etree.py
@@ -131,7 +131,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
self._element.text += data
def cloneNode(self):
- element = Element(self.name)
+ element = Element(self.name, self.namespace)
for name, value in self.attributes.iteritems():
element.attributes[name] = value
return element
@@ -227,8 +227,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
- if prefix != "html":
- name = "%s %s"%(prefix, name)
+ name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
@@ -322,7 +321,11 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if fullTree:
return self.document._element
else:
- return self.document._element.find("html")
+ if self.defaultNamespace is not None:
+ return self.document._element.find(
+ "{%s}html"%self.defaultNamespace)
+ else:
+ return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
diff --git a/planet/vendor/html5lib/treebuilders/etree_lxml.py b/planet/vendor/html5lib/treebuilders/etree_lxml.py
index 92f0f87..80a4005 100644
--- a/planet/vendor/html5lib/treebuilders/etree_lxml.py
+++ b/planet/vendor/html5lib/treebuilders/etree_lxml.py
@@ -86,12 +86,8 @@ def testSerializer(element):
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
- if prefix != "html":
- rv.append("|%s<%s %s>"%(' '*indent, prefix,
- filter.fromXmlName(tag)))
- else:
- rv.append("|%s<%s>"%(' '*indent,
- filter.fromXmlName(tag)))
+ rv.append("|%s<%s %s>"%(' '*indent, prefix,
+ filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
@@ -207,12 +203,12 @@ class TreeBuilder(_base.TreeBuilder):
self._attributes = Attributes(self)
def _setName(self, name):
- self._name = filter.coerceElement(name)
+ self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
- return self._name
+ return filter.fromXmlName(self._name)
name = property(_getName, _setName)
@@ -281,8 +277,9 @@ class TreeBuilder(_base.TreeBuilder):
publicId = token["publicId"]
systemId = token["systemId"]
- if not name or ihatexml.nonXmlBMPRegexp.search(name):
+ if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
+
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
@@ -296,15 +293,14 @@ class TreeBuilder(_base.TreeBuilder):
#Therefore we need to use the built-in parser to create our iniial
#tree, after which we can add elements like normal
docStr = ""
- if self.doctype and self.doctype.name:
+ if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
- #TODO - this needs to work when elements are not put into the default ns
- docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
+ docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
try:
root = etree.fromstring(docStr)
@@ -320,9 +316,17 @@ class TreeBuilder(_base.TreeBuilder):
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
+ # Give the root element the right name
+ name = token["name"]
+ namespace = token.get("namespace", self.defaultNamespace)
+ if namespace is None:
+ etree_tag = name
+ else:
+ etree_tag = "{%s}%s"%(namespace, name)
+ root.tag = etree_tag
+
#Add the root element to the internal child/open data structures
- namespace = token.get("namespace", None)
- root_element = self.elementClass(token["name"], namespace)
+ root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
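The root-renaming code above spells a namespaced tag in the Clark notation that lxml and ElementTree share, "{namespace}localname". A one-line sketch of the format it builds:

    # Sketch: Clark notation for a namespaced ElementTree/lxml tag.
    name = "html"
    namespace = "http://www.w3.org/1999/xhtml"
    etree_tag = "{%s}%s" % (namespace, name)
    assert etree_tag == "{http://www.w3.org/1999/xhtml}html"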
diff --git a/planet/vendor/html5lib/treebuilders/simpletree.py b/planet/vendor/html5lib/treebuilders/simpletree.py
index 6d92892..ff6bfe4 100755
--- a/planet/vendor/html5lib/treebuilders/simpletree.py
+++ b/planet/vendor/html5lib/treebuilders/simpletree.py
@@ -62,14 +62,7 @@ class Node(_base.Node):
node.parent = None
def cloneNode(self):
- newNode = type(self)(self.name)
- if hasattr(self, 'namespace'):
- newNode.namespace = self.namespace
- if hasattr(self, 'attributes'):
- for attr, value in self.attributes.iteritems():
- newNode.attributes[attr] = value
- newNode.value = self.value
- return newNode
+ raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text"""
@@ -112,11 +105,17 @@ class Document(Node):
tree += child.printTree(2)
return tree
+ def cloneNode(self):
+ return Document()
+
class DocumentFragment(Document):
type = 2
def __unicode__(self):
return "#document-fragment"
+ def cloneNode(self):
+ return DocumentFragment()
+
class DocumentType(Node):
type = 3
def __init__(self, name, publicId, systemId):
@@ -140,6 +139,9 @@ class DocumentType(Node):
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
+ def cloneNode(self):
+ return DocumentType(self.name, self.publicId, self.systemId)
+
class TextNode(Node):
type = 4
def __init__(self, value):
@@ -154,6 +156,9 @@ class TextNode(Node):
hilite = toxml
+ def cloneNode(self):
+ return TextNode(self.value)
+
class Element(Node):
type = 5
def __init__(self, name, namespace=None):
@@ -162,7 +167,7 @@ class Element(Node):
self.attributes = {}
def __unicode__(self):
- if self.namespace in (None, namespaces["html"]):
+ if self.namespace == None:
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
@@ -206,6 +211,14 @@ class Element(Node):
tree += child.printTree(indent)
return tree
+ def cloneNode(self):
+ newNode = Element(self.name)
+ if hasattr(self, 'namespace'):
+ newNode.namespace = self.namespace
+ for attr, value in self.attributes.iteritems():
+ newNode.attributes[attr] = value
+ return newNode
+
class CommentNode(Node):
type = 6
def __init__(self, data):
@@ -221,6 +234,9 @@ class CommentNode(Node):
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
+ def cloneNode(self):
+ return CommentNode(self.data)
+
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
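simpletree now gives each node class its own cloneNode instead of the old generic implementation; for Element the clone keeps the name, namespace and attributes but, as before, no children. A small sketch, assuming the vendored html5lib package is importable as plain html5lib:

    # Sketch: Element.cloneNode is shallow.
    from html5lib.treebuilders import simpletree

    div = simpletree.Element("div", namespace="http://www.w3.org/1999/xhtml")
    div.attributes["class"] = "entry"
    div.appendChild(simpletree.TextNode("hello"))

    copy = div.cloneNode()
    assert copy.attributes == {"class": "entry"}
    assert copy.childNodes == []        # children are not cloned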
diff --git a/planet/vendor/html5lib/treebuilders/soup.py b/planet/vendor/html5lib/treebuilders/soup.py
index 367de06..bca2baf 100644
--- a/planet/vendor/html5lib/treebuilders/soup.py
+++ b/planet/vendor/html5lib/treebuilders/soup.py
@@ -1,5 +1,7 @@
import warnings
+warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
+
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
@@ -134,6 +136,11 @@ class TextNode(Element):
raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
+ def __init__(self, namespaceHTMLElements):
+ if namespaceHTMLElements:
+ warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
+ _base.TreeBuilder.__init__(self, namespaceHTMLElements)
+
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup, None)
@@ -144,16 +151,16 @@ class TreeBuilder(_base.TreeBuilder):
systemId = token["systemId"]
if publicId:
- self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
+ self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
- self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
+ self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
(name, systemId)))
else:
- self.soup.insert(0, Declaration(name))
+ self.soup.insert(0, Declaration("DOCTYPE %s"%name))
def elementClass(self, name, namespace):
- if namespace not in (None, namespaces["html"]):
- warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
+ if namespace is not None:
+ warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
@@ -181,7 +188,7 @@ def testSerializer(element):
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
- doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
+ doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
diff --git a/planet/vendor/html5lib/treewalkers/_base.py b/planet/vendor/html5lib/treewalkers/_base.py
index 2b192bd..4128be1 100644
--- a/planet/vendor/html5lib/treewalkers/_base.py
+++ b/planet/vendor/html5lib/treewalkers/_base.py
@@ -60,9 +60,13 @@ class TreeWalker(object):
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
- "publicId": publicId, "systemId": systemId,
+ "publicId": publicId,
+ "systemId": systemId,
"correct": correct}
+ def entity(self, name):
+ return {"type": "Entity", "name": unicode(name)}
+
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
@@ -88,6 +92,7 @@ DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
+ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
@@ -121,7 +126,8 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
- for token in self.emptyTag(namespace, name, attributes, hasChildren):
+ for token in self.emptyTag(namespace, name, attributes,
+ hasChildren):
yield token
hasChildren = False
else:
@@ -131,6 +137,9 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == COMMENT:
yield self.comment(details[0])
+ elif type == ENTITY:
+ yield self.entity(details[0])
+
elif type == DOCUMENT:
hasChildren = True
@@ -152,11 +161,12 @@ class NonRecursiveTreeWalker(TreeWalker):
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(namespace, name)
+ if self.tree is currentNode:
+ currentNode = None
+ break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
- if self.tree is currentNode:
- currentNode = None
else:
currentNode = self.getParentNode(currentNode)
diff --git a/planet/vendor/html5lib/treewalkers/dom.py b/planet/vendor/html5lib/treewalkers/dom.py
index c2b0712..0adc77f 100644
--- a/planet/vendor/html5lib/treewalkers/dom.py
+++ b/planet/vendor/html5lib/treewalkers/dom.py
@@ -4,7 +4,6 @@ import gettext
_ = gettext.gettext
import _base
-
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):
diff --git a/planet/vendor/html5lib/treewalkers/genshistream.py b/planet/vendor/html5lib/treewalkers/genshistream.py
index 0014073..ef71a83 100644
--- a/planet/vendor/html5lib/treewalkers/genshistream.py
+++ b/planet/vendor/html5lib/treewalkers/genshistream.py
@@ -1,5 +1,5 @@
-from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
- START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
+from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
+from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
@@ -49,7 +49,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END:
name = data.localname
namespace = data.namespace
- if (namespace, name) not in voidElements:
+ if name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
diff --git a/planet/vendor/html5lib/treewalkers/lxmletree.py b/planet/vendor/html5lib/treewalkers/lxmletree.py
index 3f4de4f..2c38aff 100644
--- a/planet/vendor/html5lib/treewalkers/lxmletree.py
+++ b/planet/vendor/html5lib/treewalkers/lxmletree.py
@@ -96,6 +96,9 @@ class FragmentWrapper(object):
def __str__(self):
return str(self.obj)
+ def __unicode__(self):
+ return unicode(self.obj)
+
def __len__(self):
return len(self.obj)
@@ -126,6 +129,9 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
+ elif node.tag == etree.Entity:
+ return _base.ENTITY, node.text[1:-1] # strip &;
+
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
diff --git a/planet/vendor/html5lib/treewalkers/soup.py b/planet/vendor/html5lib/treewalkers/soup.py
index ae29f03..fca65ec 100644
--- a/planet/vendor/html5lib/treewalkers/soup.py
+++ b/planet/vendor/html5lib/treewalkers/soup.py
@@ -3,12 +3,12 @@ import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
-
+from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
- r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
+ r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
@@ -26,6 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
+
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
@@ -44,8 +45,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node
elif isinstance(node, Tag): # Element
- return _base.ELEMENT, node.name, \
- dict(node.attrs).items(), node.contents
+ return (_base.ELEMENT, namespaces["html"], node.name,
+ dict(node.attrs).items(), node.contents)
else:
return _base.UNKNOWN, node.__class__.__name__
diff --git a/planet/vendor/html5lib/utils.py b/planet/vendor/html5lib/utils.py
index 7c6c8ae..d53f678 100644
--- a/planet/vendor/html5lib/utils.py
+++ b/planet/vendor/html5lib/utils.py
@@ -153,4 +153,23 @@ class deque(object):
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
- return result \ No newline at end of file
+ return result
+
+#Some utility functions to deal with weirdness around UCS2 vs UCS4
+#python builds
+
+def encodingType():
+ if len(u"\U0010FFFF") == 2:
+ return "UCS2"
+ else:
+ return "UCS4"
+
+def isSurrogatePair(data):
+ return (len(data) == 2 and
+ ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
+ ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
+
+def surrogatePairToCodepoint(data):
+ char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
+ (ord(data[1]) - 0xDC00))
+ return char_val
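
(For illustration only, not part of the patch: a minimal sketch of how the new
surrogate-pair helpers behave on a narrow/UCS2 Python 2 build, where a non-BMP
character is stored as two UTF-16 code units. It assumes "html5lib" resolves to
the vendored copy on sys.path.)

    from html5lib.utils import isSurrogatePair, surrogatePairToCodepoint

    data = u"\U0001D11E"        # MUSICAL SYMBOL G CLEF, above U+FFFF
    if len(data) == 2:          # narrow (UCS2) build: stored as a surrogate pair
        assert isSurrogatePair(data)
        print hex(surrogatePairToCodepoint(data))   # 0x1d11e
    else:                       # wide (UCS4) build: a single code point
        print hex(ord(data))
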
diff --git a/planet/vendor/httplib2/__init__.py b/planet/vendor/httplib2/__init__.py
index 56c018b..ee65304 100644
--- a/planet/vendor/httplib2/__init__.py
+++ b/planet/vendor/httplib2/__init__.py
@@ -353,7 +353,7 @@ def _decompressContent(response, new_content):
# Record the historical presence of the encoding in a way the won't interfere.
response['-content-encoding'] = response['content-encoding']
del response['content-encoding']
- except IOError:
+ except (IOError, zlib.error), e:
content = ""
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
return content
@@ -884,6 +884,7 @@ the same interface as FileCache."""
if auth:
auth.request(method, request_uri, headers, body)
+ conn.connect()
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
if auth:
diff --git a/planet/vendor/pubsubhubbub_publisher/PKG-INFO b/planet/vendor/pubsubhubbub_publisher/PKG-INFO
new file mode 100644
index 0000000..072227a
--- /dev/null
+++ b/planet/vendor/pubsubhubbub_publisher/PKG-INFO
@@ -0,0 +1,10 @@
+Metadata-Version: 1.0
+Name: PubSubHubbub_Publisher
+Version: 1.0
+Summary: Publisher client for PubSubHubbub
+Home-page: http://code.google.com/p/pubsubhubbub/
+Author: Brett Slatkin
+Author-email: bslatkin@gmail.com
+License: Apache 2.0
+Description: A simple, open, server-to-server web-hook-based pubsub (publish/subscribe) protocol as a simple extension to Atom. Parties (servers) speaking the PubSubHubbub protocol can get near-instant notifications (via webhook callbacks) when a topic (Atom URL) they're interested in is updated.
+Platform: UNKNOWN
diff --git a/planet/vendor/pubsubhubbub_publisher/__init__.py b/planet/vendor/pubsubhubbub_publisher/__init__.py
new file mode 100644
index 0000000..d9dbb68
--- /dev/null
+++ b/planet/vendor/pubsubhubbub_publisher/__init__.py
@@ -0,0 +1,2 @@
+from pubsubhubbub_publish import *
+
diff --git a/planet/vendor/pubsubhubbub_publisher/pubsubhubbub_publish.py b/planet/vendor/pubsubhubbub_publisher/pubsubhubbub_publish.py
new file mode 100644
index 0000000..9ae6e66
--- /dev/null
+++ b/planet/vendor/pubsubhubbub_publisher/pubsubhubbub_publish.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+#
+# Copyright 2009 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Simple Publisher client for PubSubHubbub.
+
+Example usage:
+
+ from pubsubhubbub_publish import *
+ try:
+ publish('http://pubsubhubbub.appspot.com',
+ 'http://example.com/feed1/atom.xml',
+ 'http://example.com/feed2/atom.xml',
+ 'http://example.com/feed3/atom.xml')
+ except PublishError, e:
+ # handle exception...
+
+Set the 'http_proxy' environment variable on *nix or Windows to use an
+HTTP proxy.
+"""
+
+__author__ = 'bslatkin@gmail.com (Brett Slatkin)'
+
+import urllib
+import urllib2
+
+
+class PublishError(Exception):
+ """An error occurred while trying to publish to the hub."""
+
+
+URL_BATCH_SIZE = 100
+
+
+def publish(hub, *urls):
+ """Publishes an event to a hub.
+
+ Args:
+ hub: The hub to publish the event to.
+ *urls: One or more URLs to publish to. If only a single URL argument is
+ passed and that item is an iterable that is not a string, the contents of
+ that iterable will be used to produce the list of published URLs. If
+ more than URL_BATCH_SIZE URLs are supplied, this function will batch them
+ into chunks across multiple requests.
+
+ Raises:
+ PublishError if anything went wrong during publishing.
+ """
+ if len(urls) == 1 and not isinstance(urls[0], basestring):
+ urls = list(urls[0])
+
+ for i in xrange(0, len(urls), URL_BATCH_SIZE):
+ chunk = urls[i:i+URL_BATCH_SIZE]
+ data = urllib.urlencode(
+ {'hub.url': chunk, 'hub.mode': 'publish'}, doseq=True)
+ try:
+ response = urllib2.urlopen(hub, data)
+ except (IOError, urllib2.HTTPError), e:
+ if hasattr(e, 'code') and e.code == 204:
+ continue
+ error = ''
+ if hasattr(e, 'read'):
+ error = e.read()
+ raise PublishError('%s, Response: "%s"' % (e, error))
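
(For illustration only, not part of the patch: the single-iterable calling form
described in the docstring above, with placeholder feed URLs. publish() splits
anything longer than URL_BATCH_SIZE into multiple requests.)

    from pubsubhubbub_publish import publish, PublishError

    feeds = ['http://example.com/feed%d/atom.xml' % i for i in range(250)]
    try:
        publish('http://pubsubhubbub.appspot.com', feeds)  # sent as 3 batches of <=100
    except PublishError, e:
        print 'hub ping failed:', e
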
diff --git a/publish.py b/publish.py
new file mode 100755
index 0000000..fbbfd07
--- /dev/null
+++ b/publish.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+"""
+Main program to run just the splice portion of planet
+"""
+
+import os.path
+import sys
+from planet import publish, config
+
+if __name__ == '__main__':
+
+ if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
+ config.load(sys.argv[1])
+ publish.publish(config)
+ else:
+ print "Usage:"
+ print " python %s config.ini" % sys.argv[0]
diff --git a/tests/data/config/basic.csv b/tests/data/config/basic.csv
index b7e4178..d5ea28a 100644
--- a/tests/data/config/basic.csv
+++ b/tests/data/config/basic.csv
@@ -1,3 +1,5 @@
url,name,filters
feed1,one
+feed1,one
+feed2,two,bar
feed2,two,bar
diff --git a/tests/data/config/basic.ini b/tests/data/config/basic.ini
index bb6ec46..f5716a4 100644
--- a/tests/data/config/basic.ini
+++ b/tests/data/config/basic.ini
@@ -5,6 +5,7 @@ template_files = index.html.tmpl atom.xml.tmpl
items_per_page = 50
filters = foo
feed_timeout=30
+pubsubhubbub_hub = http://pubsubhubbub.appspot.com
[index.html.tmpl]
days_per_page = 7
diff --git a/tests/data/filter/django/test.xml b/tests/data/filter/django/test.xml
index 323a3e8..1d9882a 100644
--- a/tests/data/filter/django/test.xml
+++ b/tests/data/filter/django/test.xml
@@ -10,7 +10,7 @@
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
- <title>&#161;Atom-Powered Robots Run Amok!</title>
+ <title type='xhtml'>&#161;Atom-Powered <b>Robots</b> Run Amok!</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
diff --git a/tests/data/reconstitute/content_illegal_char.xml b/tests/data/reconstitute/content_illegal_char.xml
index cf4e53f..b7c7c08 100644
--- a/tests/data/reconstitute/content_illegal_char.xml
+++ b/tests/data/reconstitute/content_illegal_char.xml
@@ -1,6 +1,6 @@
<!--
Description: illegal control character
-Expect: content[0].value == u'Page 1 Page 2'
+Expect: 'U+000c' in content[0].value
-->
<feed xmns="http://www.w3.org/2005/Atom">
diff --git a/tests/data/reconstitute/dc_date_taken.xml b/tests/data/reconstitute/dc_date_taken.xml
new file mode 100644
index 0000000..3d83960
--- /dev/null
+++ b/tests/data/reconstitute/dc_date_taken.xml
@@ -0,0 +1,12 @@
+<!--
+Description: Dublin Core date: Taken (as used by e.g. flickr)
+Expect: dc_date_taken == '2010-10-15T16:10:05-01:00'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom"
+ xmlns:dc="http://purl.org/dc/elements/1.1/">
+ <entry>
+ <dc:date.Taken>2010-10-15T16:10:05-01:00</dc:date.Taken>
+ </entry>
+</feed>
+
diff --git a/tests/data/reconstitute/georss_box_latlong.xml b/tests/data/reconstitute/georss_box_latlong.xml
new file mode 100644
index 0000000..4973fe8
--- /dev/null
+++ b/tests/data/reconstitute/georss_box_latlong.xml
@@ -0,0 +1,10 @@
+<!--
+Description: box inside an entry (center point calculated)
+Expect: geo_lat == '42.991000' and geo_long == '-70.444000'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <georss:box>42.943 -71.032 43.039 -69.856</georss:box>
+ </entry>
+</feed>
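
(For illustration only, not part of the patch: the expected geo_lat/geo_long above
are the center of the georss:box, i.e. the average of its two corners formatted to
six decimal places; how reconstitute.py actually computes them may differ.)

    lat1, long1, lat2, long2 = map(float, '42.943 -71.032 43.039 -69.856'.split())
    print '%.6f' % ((lat1 + lat2) / 2)    # 42.991000
    print '%.6f' % ((long1 + long2) / 2)  # -70.444000
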
diff --git a/tests/data/reconstitute/georss_placeboxpolygon_latlong.xml b/tests/data/reconstitute/georss_placeboxpolygon_latlong.xml
new file mode 100644
index 0000000..396da5e
--- /dev/null
+++ b/tests/data/reconstitute/georss_placeboxpolygon_latlong.xml
@@ -0,0 +1,15 @@
+<!--
+Description: polygon inside bounding box inside place inside an entry
+Expect: geo_lat == '34.052610' and geo_long == '-118.432212'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom"
+ xmlns:twitter="http://api.twitter.com">
+ <entry>
+ <twitter:place xmlns:georss="http://www.georss.org/georss">
+ <twitter:bounding_box>
+ <georss:polygon>34.05260997 -118.43221212 34.05260997 -118.37216196 34.11240804 -118.37216196 34.11240804 -118.43221212</georss:polygon>
+ </twitter:bounding_box>
+ </twitter:place>
+ </entry>
+</feed>
diff --git a/tests/data/reconstitute/georss_point_latlong.xml b/tests/data/reconstitute/georss_point_latlong.xml
new file mode 100644
index 0000000..f21deea
--- /dev/null
+++ b/tests/data/reconstitute/georss_point_latlong.xml
@@ -0,0 +1,11 @@
+<!--
+Description: point inside an entry
+Expect: geo_lat == '34.101646' and geo_long == '-118.326454'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom"
+ xmlns:twitter="http://api.twitter.com">
+ <entry>
+ <georss:point>34.10164620,-118.32645359</georss:point>
+ </entry>
+</feed>
diff --git a/tests/data/reconstitute/georss_polygon_latlong.xml b/tests/data/reconstitute/georss_polygon_latlong.xml
new file mode 100644
index 0000000..e005b93
--- /dev/null
+++ b/tests/data/reconstitute/georss_polygon_latlong.xml
@@ -0,0 +1,10 @@
+<!--
+Description: polygon inside an entry
+Expect: geo_lat == '34.052610' and geo_long == '-118.432212'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <georss:polygon>34.052610 -118.432212 34.05260997 -118.37216196 34.11240804 -118.37216196 34.11240804 -118.43221212</georss:polygon>
+ </entry>
+</feed>
diff --git a/tests/data/reconstitute/gr_id.xml b/tests/data/reconstitute/gr_id.xml
new file mode 100644
index 0000000..eca22b7
--- /dev/null
+++ b/tests/data/reconstitute/gr_id.xml
@@ -0,0 +1,11 @@
+<!--
+Description: id
+Expect: id == 'http://example.com/2'
+-->
+
+<feed xmlns="http://www.w3.org/2005/Atom">
+ <entry xmlns:gr="http://www.google.com/schemas/reader/atom/">
+ <id gr:original-id="http://example.com/2">http://example.com/1</id>
+ </entry>
+</feed>
+
diff --git a/tests/data/reconstitute/rss_source.xml b/tests/data/reconstitute/rss_source.xml
index 28acaa5..a00325c 100644
--- a/tests/data/reconstitute/rss_source.xml
+++ b/tests/data/reconstitute/rss_source.xml
@@ -1,6 +1,6 @@
<!--
Description: source element
-Expect: source.title == 'foo'
+Expect: source.links[0].title == 'org' and source.links[0].href == 'http://www.example.org'
-->
<rss version="2.0">
diff --git a/tests/data/spider/config.ini b/tests/data/spider/config.ini
index 7a6c5e7..1b8eec1 100644
--- a/tests/data/spider/config.ini
+++ b/tests/data/spider/config.ini
@@ -1,6 +1,7 @@
[Planet]
name = test planet
cache_directory = tests/work/spider/cache
+cache_blacklist_directory = tests/work/spider/cache/blacklist
[tests/data/spider/testfeed0.atom]
name = not found
diff --git a/tests/reconstitute.py b/tests/reconstitute.py
index dd57b92..58b506e 100644
--- a/tests/reconstitute.py
+++ b/tests/reconstitute.py
@@ -71,7 +71,8 @@ if __name__ == "__main__":
child = source.firstChild
source.removeChild(child)
feed.insertBefore(child, source)
- for source in doc.getElementsByTagName('source'):
+ atomNS='http://www.w3.org/2005/Atom'
+ for source in doc.getElementsByTagNameNS(atomNS, 'source'):
source.parentNode.removeChild(source)
splice.apply(doc.toxml('utf-8'))
diff --git a/tests/test_config.py b/tests/test_config.py
index e21bfb6..2bd52bb 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -30,6 +30,9 @@ class ConfigTest(unittest.TestCase):
def test_link(self):
self.assertEqual('http://example.com/', config.link())
+ def test_pubsubhubbub_hub(self):
+ self.assertEqual('http://pubsubhubbub.appspot.com', config.pubsubhubbub_hub())
+
# per template configuration
def test_days_per_page(self):
diff --git a/tests/test_docs.py b/tests/test_docs.py
index f9a2310..a0198db 100644
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@@ -1,18 +1,29 @@
#!/usr/bin/env python
-import unittest, os
+import unittest, os, re
from xml.dom import minidom
from glob import glob
+from htmlentitydefs import name2codepoint as n2cp
class DocsTest(unittest.TestCase):
def test_well_formed(self):
+ def substitute_entity(match):
+ ent = match.group(1)
+ try:
+ return "&#%d;" % n2cp[ent]
+ except:
+ return "&%s;" % ent
+
for doc in glob('docs/*'):
if os.path.isdir(doc): continue
if doc.endswith('.css') or doc.endswith('.js'): continue
+ source = open(doc).read()
+ source = re.sub('&(\w+);', substitute_entity, source)
+
try:
- minidom.parse(doc)
+ minidom.parseString(source)
except:
self.fail('Not well formed: ' + doc);
break
diff --git a/tests/test_filter_django.py b/tests/test_filter_django.py
index 67774b8..5be54a0 100644
--- a/tests/test_filter_django.py
+++ b/tests/test_filter_django.py
@@ -24,7 +24,17 @@ class DjangoFilterTests(unittest.TestCase):
input = feed.read(); feed.close()
results = dj.run(
os.path.realpath('tests/data/filter/django/title.html.dj'), input)
- self.assertEqual(results, "\xc2\xa1Atom-Powered Robots Run Amok!\n")
+ self.assertEqual(results,
+ u"\xa1Atom-Powered &lt;b&gt;Robots&lt;/b&gt; Run Amok!\n")
+
+ def test_django_entry_title_autoescape_off(self):
+ config.load('tests/data/filter/django/test.ini')
+ config.parser.set('Planet', 'django_autoescape', 'off')
+ feed = open('tests/data/filter/django/test.xml')
+ input = feed.read(); feed.close()
+ results = dj.run(
+ os.path.realpath('tests/data/filter/django/title.html.dj'), input)
+ self.assertEqual(results, u"\xa1Atom-Powered <b>Robots</b> Run Amok!\n")
def test_django_config_context(self):
config.load('tests/data/filter/django/test.ini')
diff --git a/tests/test_reconstitute.py b/tests/test_reconstitute.py
index 754ea7a..57c92e1 100644
--- a/tests/test_reconstitute.py
+++ b/tests/test_reconstitute.py
@@ -29,7 +29,8 @@ class ReconstituteTest(unittest.TestCase):
# verify the results
results = feedparser.parse(work.getvalue().encode('utf-8'))
- self.assertFalse(results.bozo, 'xml is well formed')
+ if 'illegal' not in name:
+ self.assertFalse(results.bozo, 'xml is well formed')
if not self.simple_re.match(expect):
self.assertTrue(eval(expect, results.entries[0]), expect)
else:
diff --git a/tests/test_spider.py b/tests/test_spider.py
index 9ffba8e..183566c 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -82,6 +82,27 @@ class SpiderTest(unittest.TestCase):
self.spiderFeed(testfeed % '1b')
self.assertEqual(1, len(glob.glob(workdir+"/*")))
+ def test_spiderFeed_blacklist(self):
+ config.load(configfile)
+ self.spiderFeed(testfeed % '1b')
+
+ # verify that exactly four entries were produced
+ self.assertEqual(4, len(glob.glob(workdir+"/planet*")))
+
+ # verify that the file names are as expected
+ self.assertTrue(os.path.exists(os.path.join(workdir,
+ 'planet.intertwingly.net,2006,testfeed1,1')))
+
+ os.mkdir(os.path.join(workdir, "blacklist"))
+
+ os.rename(os.path.join(workdir,
+ 'planet.intertwingly.net,2006,testfeed1,1'),
+ os.path.join(workdir, "blacklist",
+ 'planet.intertwingly.net,2006,testfeed1,1'))
+
+ self.spiderFeed(testfeed % '1b')
+ self.assertEqual(3, len(glob.glob(workdir+"/planet*")))
+
def test_spiderUpdate(self):
config.load(configfile)
self.spiderFeed(testfeed % '1a')
diff --git a/themes/asf/default.css b/themes/asf/default.css
index a23be74..40408bd 100644
--- a/themes/asf/default.css
+++ b/themes/asf/default.css
@@ -72,6 +72,7 @@ body > h1 {
border-bottom: 2px solid #ccd;
-webkit-border-bottom-left-radius: 1em;
-moz-border-radius: 0 0 0 1em;
+ border-radius: 0 0 0 1em;
}
#sidebar h2 {
@@ -88,6 +89,7 @@ body > h1 {
-webkit-border-top-left-radius: 6px;
-webkit-border-bottom-left-radius: 6px;
-moz-border-radius: 6px 0 0 6px;
+ border-radius: 6px 0 0 6px;
}
#sidebar h2 a img {
@@ -138,6 +140,7 @@ body > h1 {
background-color: #EEE;
-webkit-border-radius: 0.5em;
-moz-border-radius: 0.5em;
+ border-radius: 0.5em;
border: 2px solid #BBB;
color:#000;
display: block;
@@ -192,6 +195,7 @@ body > h1 {
border: 1px solid #ccd;
-webkit-border-radius: 0.8em;
-moz-border-radius: 0.8em;
+ border-radius: 0.8em;
width: 12.5em;
margin: 4px 0 0 24px;
}
@@ -235,6 +239,7 @@ body > h1 {
-webkit-border-top-left-radius: 0.5em;
-webkit-border-bottom-left-radius: 0.5em;
-moz-border-radius: 0.5em 0 0 0.5em;
+ border-radius: 0.5em 0 0 0.5em;
text-transform: none;
font-size: medium;
color: #667;
diff --git a/themes/asf/index.html.xslt b/themes/asf/index.html.xslt
index 9f2ad38..b574aac 100644
--- a/themes/asf/index.html.xslt
+++ b/themes/asf/index.html.xslt
@@ -25,7 +25,7 @@
title="{atom:title}" type="{atom:link[@rel='self']/@type}" />
</xsl:if>
<link rel="shortcut icon" href="/favicon.ico" />
- <script type="text/javascript" src="personalize.js">
+ <script defer="defer" src="personalize.js">
<xsl:comment><!--HTML Compatibility--></xsl:comment>
</script>
</head>
@@ -193,9 +193,14 @@
<!-- entry title -->
<xsl:text>&#10;</xsl:text>
<h3>
- <xsl:if test="atom:source/atom:icon">
- <img src="{atom:source/atom:icon}" class="icon"/>
- </xsl:if>
+ <xsl:choose>
+ <xsl:when test="atom:source/atom:icon">
+ <img src="{atom:source/atom:icon}" class="icon"/>
+ </xsl:when>
+ <xsl:when test="atom:source/planet:favicon">
+ <img src="{atom:source/planet:favicon}" class="icon"/>
+ </xsl:when>
+ </xsl:choose>
<a>
<xsl:if test="atom:source/atom:link[@rel='alternate']/@href">
<xsl:attribute name="href">
diff --git a/themes/asf/personalize.js b/themes/asf/personalize.js
index d044b87..ff122ea 100644
--- a/themes/asf/personalize.js
+++ b/themes/asf/personalize.js
@@ -20,7 +20,7 @@ function prevArticle(event) {
var scrollTop = document.documentElement.scrollTop || document.body.scrollTop;
for (var i=entries.length; --i>=0;) {
if (!entries[i].anchor) continue;
- if (entries[i].anchor.offsetTop < scrollTop) {
+ if (entries[i].anchor.offsetTop+20 < scrollTop) {
window.location.hash=entries[i].anchor.id;
stopPropagation(event);
break;
@@ -294,4 +294,8 @@ function personalize() {
}
// hook event
-document.addEventListener("DOMContentLoaded", personalize, false);
+if (document.getElementById('footer')) {
+ personalize();
+} else {
+ document.addEventListener("DOMContentLoaded", personalize, false);
+}
diff --git a/themes/classic_fancy/index.html.tmpl b/themes/classic_fancy/index.html.tmpl
index 3ade246..3419d28 100644
--- a/themes/classic_fancy/index.html.tmpl
+++ b/themes/classic_fancy/index.html.tmpl
@@ -112,11 +112,11 @@ Powered by:<br>
<h2>Planetarium:</h2>
<ul>
<li><a href="http://www.planetapache.org/">Planet Apache</a></li>
-<li><a href="http://planet.debian.net/">Planet Debian</a></li>
<li><a href="http://planet.freedesktop.org/">Planet freedesktop.org</a></li>
<li><a href="http://planet.gnome.org/">Planet GNOME</a></li>
-<li><a href="http://planetsun.org/">Planet Sun</a></li>
-<li><a href="http://fedora.linux.duke.edu/fedorapeople/">Fedora People</a></li>
+<li><a href="http://planet.debian.net/">Planet Debian</a></li>
+<li><a href="http://planet.fedoraproject.org/">Planet Fedora</a></li>
+<li><a href="http://planets.sun.com/">Planet Sun</a></li>
<li><a href="http://www.planetplanet.org/">more...</a></li>
</ul>
</p>
diff --git a/themes/common/admin.html.tmpl b/themes/common/admin.html.tmpl
new file mode 100644
index 0000000..672706e
--- /dev/null
+++ b/themes/common/admin.html.tmpl
@@ -0,0 +1,41 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+ <head>
+ <title><TMPL_VAR name> administration</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <meta name="generator" content="<TMPL_VAR generator ESCAPE="HTML">">
+ </head>
+
+ <body>
+ <h2>Maintenance tasks</h2>
+ <form action="admin_cb.py" method="GET">
+ <input type="hidden" name="command" value="refresh" />
+ <input type="submit" value="Refresh planet" />
+ </form><br />
+ <form action="admin_cb.py" method="GET">
+ <input type="hidden" name="command" value="run" />
+ <input type="submit" value="Run planet" />
+ </form><br />
+ <form action="admin_cb.py" method="GET">
+ <input type="hidden" name="command" value="expunge" />
+ <input type="submit" value="Expunge planet" />
+ </form>
+
+ <h2>Blacklist</h2>
+ <div>
+ <form action="admin_cb.py" method="GET">
+ <input type="hidden" name="command" value="blacklist" />
+ <TMPL_LOOP Items>
+ <input type="checkbox" value="<TMPL_VAR id ESCAPE="HTML">" name="bl<TMPL_VAR __PASS__>">
+ <a target="_blank" href="<TMPL_VAR channel_link ESCAPE="HTML">">
+ <TMPL_VAR channel_name>
+ </a>: <a target="_blank" href="<TMPL_VAR link ESCAPE="HTML">"><TMPL_VAR title></a></input>
+ <br />
+ </TMPL_LOOP>
+ <br />
+ <input type="submit" value="Blacklist" />
+ </form>
+ </div>
+ </body>
+
+</html>
diff --git a/themes/common/rss10.xml.tmpl b/themes/common/rss10.xml.tmpl
index cdaaa79..750bf4f 100644
--- a/themes/common/rss10.xml.tmpl
+++ b/themes/common/rss10.xml.tmpl
@@ -4,12 +4,19 @@
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns:atom="http://www.w3.org/2005/Atom"
xmlns="http://purl.org/rss/1.0/"
>
<channel rdf:about="<TMPL_VAR link ESCAPE="HTML">">
<title><TMPL_VAR name ESCAPE="HTML"></title>
<link><TMPL_VAR link ESCAPE="HTML"></link>
<description><TMPL_VAR name ESCAPE="HTML"> - <TMPL_VAR link ESCAPE="HTML"></description>
+ <TMPL_IF pubsubhubbub_hub>
+ <atom:link rel="hub" href="<TMPL_VAR pubsubhubbub_hub ESCAPE="HTML">"/>
+ </TMPL_IF>
+ <TMPL_IF fullurl>
+ <atom:link rel="self" href="<TMPL_VAR fullurl ESCAPE="HTML">" type="application/rss+xml"/>
+ </TMPL_IF>
<items>
<rdf:Seq>
diff --git a/themes/common/rss20.xml.tmpl b/themes/common/rss20.xml.tmpl
index 724a104..217051c 100644
--- a/themes/common/rss20.xml.tmpl
+++ b/themes/common/rss20.xml.tmpl
@@ -1,11 +1,17 @@
<?xml version="1.0"?>
-<rss version="2.0">
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title><TMPL_VAR name></title>
<link><TMPL_VAR link ESCAPE="HTML"></link>
<language>en</language>
<description><TMPL_VAR name ESCAPE="HTML"> - <TMPL_VAR link ESCAPE="HTML"></description>
+ <TMPL_IF pubsubhubbub_hub>
+ <atom:link rel="hub" href="<TMPL_VAR pubsubhubbub_hub ESCAPE="HTML">"/>
+ </TMPL_IF>
+ <TMPL_IF fullurl>
+ <atom:link rel="self" href="<TMPL_VAR fullurl ESCAPE="HTML">" type="application/rss+xml"/>
+ </TMPL_IF>
<TMPL_LOOP Items>
<item>
diff --git a/themes/django/index.html.dj b/themes/django/index.html.dj
index 632a527..637e60d 100644
--- a/themes/django/index.html.dj
+++ b/themes/django/index.html.dj
@@ -21,7 +21,7 @@
<ul>
{% for channel in Channels %}
- <li>{{ channel.title }} by {{ channel.author_name }}</li>
+ <li>{{ channel.title|safe }} by {{ channel.author_name }}</li>
{% endfor %}
</ul>
</div>
@@ -32,9 +32,9 @@
{% endifchanged %}
<div class="entry">
- {% if item.title %}<h4>{{ item.title }}</h4>{% endif %}
+ {% if item.title %}<h4>{{ item.title|safe }}</h4>{% endif %}
- {{ item.content }}
+ {{ item.content|safe }}
<p class="entry-tools">
by {{ item.channel_author }} on
diff --git a/themes/genshi_fancy/index.html.genshi b/themes/genshi_fancy/index.html.genshi
index fe26934..85e858a 100644
--- a/themes/genshi_fancy/index.html.genshi
+++ b/themes/genshi_fancy/index.html.genshi
@@ -81,11 +81,11 @@ Powered by:<br/>
<h2>Planetarium:</h2>
<ul>
<li><a href="http://www.planetapache.org/">Planet Apache</a></li>
-<li><a href="http://planet.debian.net/">Planet Debian</a></li>
<li><a href="http://planet.freedesktop.org/">Planet freedesktop.org</a></li>
<li><a href="http://planet.gnome.org/">Planet GNOME</a></li>
-<li><a href="http://planetsun.org/">Planet Sun</a></li>
-<li><a href="http://fedora.linux.duke.edu/fedorapeople/">Fedora People</a></li>
+<li><a href="http://planet.debian.net/">Planet Debian</a></li>
+<li><a href="http://planet.fedoraproject.org/">Planet Fedora</a></li>
+<li><a href="http://planets.sun.com/">Planet Sun</a></li>
<li><a href="http://www.planetplanet.org/">more...</a></li>
</ul>
</p>