summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGuillaume Seguin <guillaume@segu.in>2011-05-02 15:10:11 +0200
committerGuillaume Seguin <guillaume@segu.in>2011-05-02 15:11:17 +0200
commit36802e6cd6491c1c9e69f383af56d9059a51e1e5 (patch)
tree19c218f5010460e7af45de84241d7f28a6026901
parent5926815bad45f2cf331df39cdff82a7c85f611a0 (diff)
parent9de21094a8cf565bdfcf75688e121a5ad1f5397b (diff)
downloadplanetpixels-36802e6cd6491c1c9e69f383af56d9059a51e1e5.tar.gz
planetpixels-36802e6cd6491c1c9e69f383af56d9059a51e1e5.tar.bz2
Merge remote branch 'venus/master'HEADmaster
Conflicts: .gitignore planet/shell/dj.py planet/vendor/feedparser.py
-rw-r--r--.bzrignore4
-rwxr-xr-xadmin_cb.py141
-rw-r--r--docs/admin.html78
-rw-r--r--docs/config.html13
-rw-r--r--docs/contributing.html34
-rw-r--r--docs/index.html1
-rw-r--r--docs/templates.html6
-rw-r--r--favicon.py79
-rw-r--r--filters/mememe.plugin18
-rw-r--r--filters/minhead.py2
-rwxr-xr-xplanet.py31
-rw-r--r--planet/__init__.py4
-rw-r--r--planet/config.py29
-rwxr-xr-xplanet/csv_config.py3
-rw-r--r--planet/publish.py26
-rw-r--r--planet/reconstitute.py35
-rw-r--r--planet/scrub.py25
-rw-r--r--planet/shell/__init__.py2
-rw-r--r--planet/shell/_genshi.py16
-rw-r--r--planet/shell/dj.py4
-rw-r--r--planet/shell/tmpl.py3
-rw-r--r--planet/shell/xslt.py2
-rw-r--r--planet/spider.py20
-rw-r--r--planet/splice.py26
-rwxr-xr-xplanet/vendor/feedparser.py25
-rw-r--r--planet/vendor/html5lib/__init__.py7
-rw-r--r--planet/vendor/html5lib/constants.py53
-rw-r--r--planet/vendor/html5lib/html5parser.py565
-rw-r--r--planet/vendor/html5lib/ihatexml.py51
-rw-r--r--planet/vendor/html5lib/inputstream.py97
-rw-r--r--planet/vendor/html5lib/sanitizer.py148
-rw-r--r--planet/vendor/html5lib/serializer/htmlserializer.py68
-rw-r--r--planet/vendor/html5lib/tokenizer.py743
-rwxr-xr-xplanet/vendor/html5lib/treebuilders/__init__.py17
-rwxr-xr-xplanet/vendor/html5lib/treebuilders/_base.py50
-rw-r--r--planet/vendor/html5lib/treebuilders/dom.py46
-rwxr-xr-xplanet/vendor/html5lib/treebuilders/etree.py11
-rw-r--r--planet/vendor/html5lib/treebuilders/etree_lxml.py32
-rwxr-xr-xplanet/vendor/html5lib/treebuilders/simpletree.py34
-rw-r--r--planet/vendor/html5lib/treebuilders/soup.py19
-rw-r--r--planet/vendor/html5lib/treewalkers/_base.py18
-rw-r--r--planet/vendor/html5lib/treewalkers/dom.py1
-rw-r--r--planet/vendor/html5lib/treewalkers/genshistream.py6
-rw-r--r--planet/vendor/html5lib/treewalkers/lxmletree.py6
-rw-r--r--planet/vendor/html5lib/treewalkers/soup.py9
-rw-r--r--planet/vendor/html5lib/utils.py21
-rw-r--r--planet/vendor/httplib2/__init__.py3
-rw-r--r--planet/vendor/pubsubhubbub_publisher/PKG-INFO10
-rw-r--r--planet/vendor/pubsubhubbub_publisher/__init__.py2
-rw-r--r--planet/vendor/pubsubhubbub_publisher/pubsubhubbub_publish.py77
-rwxr-xr-xpublish.py17
-rw-r--r--tests/data/config/basic.csv2
-rw-r--r--tests/data/config/basic.ini1
-rw-r--r--tests/data/filter/django/test.xml2
-rw-r--r--tests/data/reconstitute/content_illegal_char.xml2
-rw-r--r--tests/data/reconstitute/dc_date_taken.xml12
-rw-r--r--tests/data/reconstitute/georss_box_latlong.xml10
-rw-r--r--tests/data/reconstitute/georss_placeboxpolygon_latlong.xml15
-rw-r--r--tests/data/reconstitute/georss_point_latlong.xml11
-rw-r--r--tests/data/reconstitute/georss_polygon_latlong.xml10
-rw-r--r--tests/data/reconstitute/gr_id.xml11
-rw-r--r--tests/data/reconstitute/rss_source.xml2
-rw-r--r--tests/data/spider/config.ini1
-rw-r--r--tests/reconstitute.py3
-rw-r--r--tests/test_config.py3
-rw-r--r--tests/test_docs.py15
-rw-r--r--tests/test_filter_django.py12
-rw-r--r--tests/test_reconstitute.py3
-rw-r--r--tests/test_spider.py21
-rw-r--r--themes/asf/default.css5
-rw-r--r--themes/asf/index.html.xslt13
-rw-r--r--themes/asf/personalize.js8
-rw-r--r--themes/classic_fancy/index.html.tmpl6
-rw-r--r--themes/common/admin.html.tmpl41
-rw-r--r--themes/common/rss10.xml.tmpl7
-rw-r--r--themes/common/rss20.xml.tmpl8
-rw-r--r--themes/django/index.html.dj6
-rw-r--r--themes/genshi_fancy/index.html.genshi6
78 files changed, 2197 insertions, 777 deletions
diff --git a/.bzrignore b/.bzrignore
deleted file mode 100644
index a8f0629..0000000
--- a/.bzrignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*.tmplc
-.DS_Store
-cache
-*.pluginc
diff --git a/admin_cb.py b/admin_cb.py
new file mode 100755
index 0000000..63315e1
--- /dev/null
+++ b/admin_cb.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import cgi
+import cgitb
+cgitb.enable()
+
+from urllib import unquote
+import sys, os
+
+# Modify this to point to where you usually run planet.
+BASE_DIR = '..'
+
+# Modify this to point to your venus installation dir, relative to planet dir above.
+VENUS_INSTALL = "venus"
+
+# Config file, relative to planet dir above
+CONFIG_FILE = "config/live"
+
+# Admin page URL, relative to this script's URL
+ADMIN_URL = "admin.html"
+
+
+# chdir to planet dir - config may be relative from there
+os.chdir(os.path.abspath(BASE_DIR))
+
+# Add venus to path.
+sys.path.append(VENUS_INSTALL)
+
+# Add shell dir to path - auto detection does not work
+sys.path.append(os.path.join(VENUS_INSTALL, "planet", "shell"))
+
+# import necessary planet items
+from planet import config
+from planet.spider import filename
+
+
+# Load config
+config.load(CONFIG_FILE)
+
+# parse query parameters
+form = cgi.FieldStorage()
+
+
+# Start HTML output at once
+print "Content-Type: text/html;charset=utf-8" # HTML is following
+print # blank line, end of headers
+
+
+print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
+print '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sv"><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>Admin results</title></head><body>'
+print '<div>'
+
+# Cache and blacklist dirs
+
+cache = config.cache_directory()
+blacklist = config.cache_blacklist_directory()
+
+# Must have command parameter
+if not "command" in form:
+ print "<p>Unknown command</p>"
+
+elif form['command'].value == "blacklist":
+
+
+ # Create the blacklist dir if it does not exist
+ if not os.path.exists(blacklist):
+ os.mkdir(blacklist)
+ print "<p>Created directory %s</p>" % blacklist
+
+ # find list of urls, in the form bl[n]=url
+
+ for key in form.keys():
+
+ if not key.startswith("bl"): continue
+
+ url = unquote(form[key].value)
+
+ # find corresponding files
+ cache_file = filename(cache, url)
+ blacklist_file = filename(blacklist, url)
+
+ # move to blacklist if found
+ if os.path.exists(cache_file):
+
+ os.rename(cache_file, blacklist_file)
+
+ print "<p>Blacklisted <a href='%s'>%s</a></p>" % (url, url)
+
+ else:
+
+ print "<p>Unknown file: %s</p>" % cache_file
+
+ print """
+<p>Note that blacklisting does not automatically
+refresh the planet. You will need to either wait for
+a scheduled planet run, or refresh manually from the admin interface.</p>
+"""
+
+
+elif form['command'].value == "run":
+
+ # run spider and refresh
+
+ from planet import spider, splice
+ try:
+ spider.spiderPlanet(only_if_new=False)
+ print "<p>Successfully ran spider</p>"
+ except Exception, e:
+ print e
+
+ doc = splice.splice()
+ splice.apply(doc.toxml('utf-8'))
+
+elif form['command'].value == "refresh":
+
+ # only refresh
+
+ from planet import splice
+
+ doc = splice.splice()
+ splice.apply(doc.toxml('utf-8'))
+
+ print "<p>Successfully refreshed</p>"
+
+elif form['command'].value == "expunge":
+
+ # only expunge
+ from planet import expunge
+ expunge.expungeCache()
+
+ print "<p>Successfully expunged</p>"
+
+
+
+
+print "<p><strong><a href='" + ADMIN_URL + "'>Return</a> to admin interface</strong></p>"
+
+
+
+print "</body></html>"
diff --git a/docs/admin.html b/docs/admin.html
new file mode 100644
index 0000000..811bd60
--- /dev/null
+++ b/docs/admin.html
@@ -0,0 +1,78 @@
+<!DOCTYPE html PUBLIC
+ "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
+ "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<script type="text/javascript" src="docs.js"></script>
+<link rel="stylesheet" type="text/css" href="docs.css"/>
+<title>Administration interface</title>
+</head>
+<body>
+<h2>Administration interface</h2>
+<p>Venus comes with a basic administration interface, allowing you to manually run planet, do a refresh from cache, expunge the cache or blacklist individual entries from the planet.</p>
+
+<h3>Using the administration interface</h3>
+
+<p>The administration interface allows you to manage the everyday tasks related to your venus installation.</p>
+
+<ul><li><strong>Running planet</strong>. By clicking the "Run planet" button, you can do a full run of the planet script, rechecking all the feeds and recreating the generated files. This corresponds to running <code>python planet.py config.ini</code> with no arguments. Note that, depending on the number of feeds, this operation may take some time.</li>
+<li><strong>Refreshing planet</strong>. By clicking the "Refresh planet" button, you can do an "offline" run of the planet script, without rechecking all the feeds but still recreating the generated files. This corresponds to running <code>python planet.py -o config.ini</code>.</li>
+<li><strong>Expunging the planet cache</strong>. By clicking the "Expunge cache" button, you can clean the cache from outdated entries. This corresponds to running <code>python planet.py -x config.ini</code>.</li>
+<li><strong>Blacklisting</strong>. By selecting one or more of the entries in the list of entries, and clicking the "Blacklist" button, you can stop these items from displaying on the planet. This is very useful for quickly blocking inappropriate or malformed content from your planet. <i>Note that blacklisting does not take effect until you refresh or rerun the planet</i>. (Blacklisting can also be done manually on the server by moving files from the cache directory to the blacklist directory.)</li>
+</ul>
+
+<p>Installing the administration interface securely requires some knowledge of web server configuration.</p>
+
+<p>The admin interface consists of two parts: the admin template file and the server callback script. Both must be correctly installed for the administration interface to work.</p>
+
+<h3>Installing the admin template</h3>
+
+The admin page template is found in <code>themes/common/admin.html.tmpl</code>. This template needs to be added to your config file along with your other templates, and optionally customized. Make sure that <code>action="admin_cb.py"</code> found in several places in the file points to the URL (or relative URL) of the admin callback script below.
+
+<h3>Installing the admin callback script</h3>
+
+<p>The admin callback script, admin_cb.py, needs to be copied to somewhere among your web server files. Depending on the details of your web server, your permissions, etc., this can be done in several different ways and in different places. There are three steps involved:</p>
+<ol><li>Configuring the script</li>
+<li>Enabling CGI</li>
+<li>Secure access</li></ol>
+
+
+<h4>Configuring the script</h4>
+
+<p>At the top of the script, there are four variables you must customize. The correct values of the first three variables can be found by analyzing how you normally run the <code>planet.py</code> script. If you typically run planet from within the working directory <code>BASE_DIR</code>, using a command like <blockquote><code>python [VENUS_INSTALL]/planet.py [CONFIG_FILE]</code></blockquote> you know all three values.</p>
+
+<dl><dt><code>BASE_DIR</code></dt><dd>
+This variable must contain the directory from where you usually run the planet.py script, to ensure that relative file names in the config files work correctly.</dd>
+<dt><code>VENUS_INSTALL</code></dt><dd>
+This variable must contain your venus installation directory, relative to BASE_DIR above.</dd>
+<dt><code>CONFIG_FILE</code></dt><dd>
+This variable must contain your configuration file, relative to BASE_DIR above.</dd>
+<dt><code>ADMIN_URL</code></dt><dd>
+This variable must contain the URL (or relative URL) of the administration page, relative to this script's URL.</dd>
+</dl>
+
+<h4>Enabling CGI</h4>
+
+<p>You will need to ensure that it can be run as a CGI script. This is done differently on different web server platforms, but there are at least three common patterns</p>
+
+<ul><li><b>Apache with <code>.htaccess</code></b>. If your server allows you to use <code>.htaccess</code> files, you can simply add
+<blockquote><code>Options +ExecCGI<br />
+AddHandler cgi-script .py</code></blockquote>
+in an .htaccess file in the planet output directory to enable the server to run the script. In this case, the admin_cb.py file can be put alongside the rest of the planet output files.
+</li>
+<li><b>Apache without <code>.htaccess</code></b>. If your server does not allow you to add CGI handlers to <code>.htaccess</code> files, you can add
+<blockquote><code>Options +ExecCGI<br />
+AddHandler cgi-script .py</code></blockquote>
+to the relevant part of the central apache configuration files.
+</li>
+<li><b>Apache with cgi-bin</b>. If your server only allows CGI handlers in pre-defined directories, you can place the <code>admin_cb.py</code> file there, and make sure to update the <code>action="admin_cb.py"</code> code in the template file <code>admin.html.tmpl</code>, as well as the <code>ADMIN_URL</code> in the callback script.
+</li>
+</ul>
+
+<p>In all cases, it is necessary to make sure that the script is executed as the same user that owns the planet output files and the cache. Either the planet output is owned by the apache user (usually <code>www-data</code>), or Apache's <a href="http://httpd.apache.org/docs/2.0/suexec.html">suexec</a> feature can be used to run the script as the right user.</p>
+
+<h4>Securing the admin interface</h4>
+<p>If you don't want every user to be able to administrate your planet, you must secure at least the <code>admin_cb.py</code> file, and preferably the <code>admin.html</code> file as well. This can be done using your web server's regular access control features. See <a href="http://httpd.apache.org/docs/2.0/howto/auth.html">here</a> for Apache documentation.</p>
+
+</body>
+</html>
diff --git a/docs/config.html b/docs/config.html
index 0ed6e59..ee6cf45 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -118,6 +118,19 @@ cache. If specified as a relative path, it is evaluated relative to the
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overriden on a per subscription feed basis.</dd>
+<dt><ins>pubsubhubbub_hub</ins></dt>
+<dd>URL to a PubSubHubbub hub, for example <a
+href="http://pubsubhubbub.appspot.com">http://pubsubhubbub.appspot.com</a>.
+Used by <code>publish</code> to ping the
+hub when feeds are published, speeding delivery of updates to
+subscribers. See
+the <a href="http://code.google.com/p/pubsubhubbub/"> PubSubHubbub
+home page</a> for more information.</dd>
+<dt><ins>pubsubhubbub_feeds</ins></dt>
+<dd>List of feeds to publish. Defaults to <code>atom.xml rss10.xml
+rss20.xml</code>.</dd>
+<dt id="django_autoescape"><ins>django_autoescape</ins></dt>
+<dd>Control <a href="http://docs.djangoproject.com/en/dev/ref/templates/builtins/#autoescape">autoescaping</a> behavior of django templates. Defaults to <code>on</code>.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
diff --git a/docs/contributing.html b/docs/contributing.html
index 2cf95e1..42e8835 100644
--- a/docs/contributing.html
+++ b/docs/contributing.html
@@ -1,6 +1,4 @@
-<!DOCTYPE html PUBLIC
- "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"
- "http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd">
+<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<script type="text/javascript" src="docs.js"></script>
@@ -22,46 +20,46 @@ contribution.</p>
<p>Documentation can be found in the <code>docs</code> directory. It is
straight XHTML.</p>
<p>Test cases can be found in the
-<a href="http://localhost/~rubys/venus/tests/">tests</a> directory, and
+<a href="http://intertwingly.net/code/venus/tests/">tests</a> directory, and
make use of the
<a href="http://docs.python.org/lib/module-unittest.html">Python Unit testing framework</a>. To run them, simply enter:</p>
<blockquote><pre>python runtests.py</pre></blockquote>
-<h3>Bzr</h3>
-<p>If you have done a <a href="index.html">bzr get</a>, you have already set up
+<h3>Git</h3>
+<p>If you have done a <a href="index.html">git pull</a>, you have already set up
a repository. The only additional step you might need to do is to introduce
-yourself to <a href="http://bazaar-vcs.org/">bzr</a>. Type in the following,
+yourself to <a href="http://git-scm.com/">git</a>. Type in the following,
after replacing the <b>bold text</b> with your information:</p>
-<blockquote><pre>bzr whoami '<b>Your Name</b> &lt;<b>youremail</b>@<b>example.com</b>&gt;'</pre></blockquote>
+<blockquote><pre>git config --global user.name '<b>Your Name</b>'
+git config --global user.email '<b>youremail</b>@<b>example.com</b>'</pre></blockquote>
<p>Then, simply make the changes you like. When you are done, type:</p>
-<blockquote><pre>bzr st</pre></blockquote>
+<blockquote><pre>git status</pre></blockquote>
<p>This will tell you which files you have modified, and which ones you may
have added. If you add files and you want them to be included, simply do a:</p>
-<blockquote><pre>bzr add file1 file2...</pre></blockquote>
+<blockquote><pre>git add file1 file2...</pre></blockquote>
-<p>You can also do a <code>bzr diff</code> to see if there are any changes
+<p>You can also do a <code>git diff</code> to see if there are any changes
which you made that you don't want included. I can't tell you how many
debug print statements I have caught this way.</p>
<p>Next, type:</p>
-<blockquote><pre>bzr commit</pre></blockquote>
+<blockquote><pre>git commit -a</pre></blockquote>
<p>This will allow you to enter a comment describing your change. If your
+repository is already on your web server, simply let others know where they
-can find it. If not, you can simply ftp or scp the files to your web server
-&mdash; no additional software needs to be installed on that machine.</p>
+can find it. If not, consider using <a href="">github</a> to host your
+<a href="http://help.github.com/forking/">fork</a> of Venus.</p>
<h3>Telling others</h3>
<p>Once you have a change worth sharing, post a message on the
-<a href="http://lists.planetplanet.org/mailman/listinfo/devel">mailing list</a>.</p>
-<p>Also, consider setting up a <a href="http://bzr.mfd-consult.dk/bzr-feed/">bzr-feed</a> for your repository, so people who wish to do so can automatically
-be notified of every change.</p>
-<p>There now is even an nascent <a href="http://planet.intertwingly.net/venus/">planet</a> being formed which combines these feeds of changes. You can <a href="http://planet.intertwingly.net/venus/atom.xml">subscribe</a> to it too.</p>
+<a href="http://lists.planetplanet.org/mailman/listinfo/devel">mailing
+list</a>, or use github to send a <a
+href="http://github.com/guides/pull-requests">pull request</a>.</p>
</body>
</html>
diff --git a/docs/index.html b/docs/index.html
index c461d7f..051eb64 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -22,6 +22,7 @@
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
<li><a href="filters.html">Filters and Plugins</a></li>
+<li><a href="admin.html">Administration interface</a></li>
</ul>
</li>
<li>Other
diff --git a/docs/templates.html b/docs/templates.html
index b9fd9c1..5549901 100644
--- a/docs/templates.html
+++ b/docs/templates.html
@@ -143,6 +143,12 @@ Item.</p>
requires at least Python 2.3.
</p>
+<p>
+ The <a href="config.html#django_autoescape">django_autoescape</a> config
+ option may be used to globally set the default value for
+ <a href="http://docs.djangoproject.com/en/dev/ref/templates/builtins/#autoescape">auto-escaping</a>.
+</p>
+
<h3>xslt</h3>
<p><a href="http://www.w3.org/TR/xslt">XSLT</a> is a paradox: it actually
makes some simple things easier to do than htmltmpl, and certainly can
diff --git a/favicon.py b/favicon.py
new file mode 100644
index 0000000..2e351a3
--- /dev/null
+++ b/favicon.py
@@ -0,0 +1,79 @@
+import sys, socket
+from planet import config, feedparser
+from planet.spider import filename
+from urllib2 import urlopen
+from urlparse import urljoin
+from html5lib import html5parser, treebuilders
+from ConfigParser import ConfigParser
+
+# load config files (default: config.ini)
+for arg in sys.argv[1:]:
+ config.load(arg)
+if len(sys.argv) == 1:
+ config.load('config.ini')
+
+from Queue import Queue
+from threading import Thread
+
+# determine which subscriptions have no icon but do have a html page
+fetch_queue = Queue()
+html = ['text/html', 'application/xhtml+xml']
+sources = config.cache_sources_directory()
+for sub in config.subscriptions():
+ data=feedparser.parse(filename(sources,sub))
+ if data.feed.get('icon'): continue
+ if not data.feed.get('links'): continue
+ for link in data.feed.links:
+ if link.rel=='alternate' and link.type in html:
+ fetch_queue.put((sub, link.href))
+ break
+
+# find the favicon for a given webpage
+def favicon(page):
+ parser=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+ doc=parser.parse(urlopen(page))
+ favicon = urljoin(page, '/favicon.ico')
+ for link in doc.getElementsByTagName('link'):
+ if link.hasAttribute('rel') and link.hasAttribute('href'):
+ if 'icon' in link.attributes['rel'].value.lower().split(' '):
+ favicon = urljoin(page, link.attributes['href'].value)
+ if urlopen(favicon).info()['content-length'] != '0':
+ return favicon
+
+# thread worker that fills in the dictionary which maps subs to favicon
+icons = {}
+def fetch(thread_index, fetch_queue, icons):
+ while 1:
+ sub, html = fetch_queue.get()
+ if not html: break
+ try:
+ icon = favicon(html)
+ if icon: icons[sub] = icon
+ except:
+ pass
+
+# set timeout
+try:
+ socket.setdefaulttimeout(float(config.feed_timeout()))
+except:
+ pass
+
+# (optionally) spawn threads, fetch pages
+threads = {}
+if int(config.spider_threads()):
+ for i in range(int(config.spider_threads())):
+ threads[i] = Thread(target=fetch, args=(i, fetch_queue, icons))
+ fetch_queue.put((None, None))
+ threads[i].start()
+ for i in range(int(config.spider_threads())):
+ threads[i].join()
+else:
+ fetch_queue.put((None, None))
+ fetch(0, fetch_queue, icons)
+
+# produce config file
+config = ConfigParser()
+for sub, icon in icons.items():
+ config.add_section(sub)
+ config.set(sub, 'favicon', icon)
+config.write(sys.stdout)
diff --git a/filters/mememe.plugin b/filters/mememe.plugin
index 273c02c..8073a27 100644
--- a/filters/mememe.plugin
+++ b/filters/mememe.plugin
@@ -18,10 +18,15 @@
# sidebar = @class='sidebar'
#
-import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
+import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
+try:
+ from hashlib import md5
+except:
+ from md5 import new as md5
+
import planet
from planet import config
from planet.spider import filename
@@ -209,6 +214,7 @@ class html(sgmllib.SGMLParser):
self.feedurl = ""
self.intitle = False
+ url = url.split('#')[0]
headers = check_cache(url)
try:
@@ -380,6 +386,7 @@ from urllib import quote_plus
for i in range(0,len(weighted_links)):
weight, link, updated = weighted_links[i]
if link in spam: continue
+ if unique_votes(all_links[link]) < 2: continue
# ensure that somebody new points to this entry. This guards against
# groups of related links which several posts point to all.
@@ -405,6 +412,7 @@ for i in range(0,len(weighted_links)):
# otherwise, parse the html
if not title:
title = html(revmap.get(link,link)).title
+ if not title: title = link.strip('/').split('/')[-1]
# dehtmlize
title = re.sub('&(\w+);',
@@ -413,7 +421,7 @@ for i in range(0,len(weighted_links)):
title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
# title too long? Insert zero width spaces where appropriate
- if max(map(len,title.split())) > 30:
+ if len(title.strip())>0 and max(map(len,title.split())) > 30:
title=re.sub('(\W+)',u'\\1\u200b',title)
# save the entry title (it is used later)
@@ -467,7 +475,7 @@ for i in range(0,len(weighted_links)):
tagbase = config.link().split('/')
if not tagbase[-1]: tagbase = tagbase[:-1]
tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
- entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
+ entry.newTextChild(None, 'id', tagbase % md5(link).hexdigest())
entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
meme_link = entry.newTextChild(None, 'link', None)
meme_link.setProp('href', link)
@@ -488,6 +496,10 @@ for i in range(0,len(weighted_links)):
count = count + 1
if count >= 10: break
+# remove ul when there are no memes
+if memes_ul.lsCountNode() < 1:
+ memes_ul.unlinkNode()
+
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
diff --git a/filters/minhead.py b/filters/minhead.py
index b9c225e..056481b 100644
--- a/filters/minhead.py
+++ b/filters/minhead.py
@@ -28,7 +28,7 @@ if first < minhead:
for i in range(6,0,-1):
for oldhead in doc.getElementsByTagName('h%d' % i):
newhead = doc.createElementNS(XHTML_NAMESPACE, 'h%d' % (i+minhead-first))
- for child in oldhead.childNodes:
+ for child in oldhead.childNodes[:]:
newhead.appendChild(child)
oldhead.parentNode.replaceChild(newhead, oldhead)
diff --git a/planet.py b/planet.py
index c278c06..26191bb 100755
--- a/planet.py
+++ b/planet.py
@@ -17,11 +17,13 @@ __license__ = "Python"
import os, sys
if __name__ == "__main__":
- config_file = "config.ini"
+ config_file = []
offline = 0
verbose = 0
only_if_new = 0
expunge = 0
+ debug_splice = 0
+ no_publish = 0
for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help":
@@ -33,6 +35,7 @@ if __name__ == "__main__":
print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
+ print " --no-publish Do not publish feeds using PubSubHubbub"
print
sys.exit(0)
elif arg == "-v" or arg == "--verbose":
@@ -43,14 +46,18 @@ if __name__ == "__main__":
only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
+ elif arg == "-d" or arg == "--debug-splice":
+ debug_splice = 1
+ elif arg == "--no-publish":
+ no_publish = 1
elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg
sys.exit(1)
else:
- config_file = arg
+ config_file.append(arg)
from planet import config
- config.load(config_file)
+ config.load(config_file or 'config.ini')
if verbose:
import planet
@@ -65,8 +72,26 @@ if __name__ == "__main__":
from planet import splice
doc = splice.splice()
+
+ if debug_splice:
+ from planet import logger
+ logger.info('writing debug.atom')
+ debug=open('debug.atom','w')
+ try:
+ from lxml import etree
+ from StringIO import StringIO
+ tree = etree.tostring(etree.parse(StringIO(doc.toxml())))
+ debug.write(etree.tostring(tree, pretty_print=True))
+ except:
+ debug.write(doc.toprettyxml(indent=' ', encoding='utf-8'))
+ debug.close
+
splice.apply(doc.toxml('utf-8'))
+ if config.pubsubhubbub_hub() and not no_publish:
+ from planet import publish
+ publish.publish(config)
+
if expunge:
from planet import expunge
expunge.expungeCache
diff --git a/planet/__init__.py b/planet/__init__.py
index f90dfe9..61c2cb1 100644
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -36,5 +36,7 @@ sys.path.insert(1, os.path.join(os.path.dirname(__file__),'vendor'))
# Configure feed parser
import feedparser
-feedparser.SANITIZE_HTML=0
+feedparser.SANITIZE_HTML=1
feedparser.RESOLVE_RELATIVE_URIS=0
+
+import publish
diff --git a/planet/config.py b/planet/config.py
index ba9821c..5295e62 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -105,6 +105,8 @@ def __init__():
define_planet('output_theme', '')
define_planet('output_dir', 'output')
define_planet('spider_threads', 0)
+ define_planet('pubsubhubbub_hub', '')
+ define_planet_list('pubsubhubbub_feeds', 'atom.xml rss10.xml rss20.xml')
define_planet_int('new_feed_items', 0)
define_planet_int('feed_timeout', 20)
@@ -116,6 +118,7 @@ def __init__():
define_planet_list('bill_of_materials')
define_planet_list('template_directories', '.')
define_planet_list('filter_directories')
+ define_planet('django_autoescape', 'on')
# template options
define_tmpl_int('days_per_page', 0)
@@ -134,11 +137,11 @@ def __init__():
define_tmpl('filter', None)
define_tmpl('exclude', None)
-def load(config_file):
+def load(config_files):
""" initialize and load a configuration"""
global parser
parser = ConfigParser()
- parser.read(config_file)
+ parser.read(config_files)
import config, planet
from planet import opml, foaf, csv_config
@@ -157,8 +160,11 @@ def load(config_file):
dirs = config.template_directories()
if theme_dir not in dirs:
dirs.append(theme_dir)
- if os.path.dirname(config_file) not in dirs:
- dirs.append(os.path.dirname(config_file))
+ if not hasattr(config_files, 'append'):
+ config_files = [config_files]
+ for config_file in config_files:
+ if os.path.dirname(config_file) not in dirs:
+ dirs.append(os.path.dirname(config_file))
# read in the theme
parser = ConfigParser()
@@ -172,7 +178,7 @@ def load(config_file):
# merge configurations, allowing current one to override theme
template_files = config.template_files()
parser.set('Planet','template_files','')
- parser.read(config_file)
+ parser.read(config_files)
for file in config.bill_of_materials():
if not file in bom: bom.append(file)
parser.set('Planet', 'bill_of_materials', ' '.join(bom))
@@ -306,7 +312,7 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
def http_cache_directory():
if parser.has_option('Planet', 'http_cache_directory'):
- os.path.join(cache_directory(),
+ return os.path.join(cache_directory(),
parser.get('Planet', 'http_cache_directory'))
else:
return os.path.join(cache_directory(), "cache")
@@ -318,9 +324,16 @@ def cache_sources_directory():
else:
return os.path.join(cache_directory(), 'sources')
+def cache_blacklist_directory():
+ if parser.has_option('Planet', 'cache_blacklist_directory'):
+ return os.path.join(cache_directory(),
+ parser.get('Planet', 'cache_blacklist_directory'))
+ else:
+ return os.path.join(cache_directory(), 'blacklist')
+
def cache_lists_directory():
if parser.has_option('Planet', 'cache_lists_directory'):
- parser.get('Planet', 'cache_lists_directory')
+ return parser.get('Planet', 'cache_lists_directory')
else:
return os.path.join(cache_directory(), 'lists')
@@ -335,7 +348,7 @@ def feed():
def feedtype():
if parser.has_option('Planet', 'feedtype'):
- parser.get('Planet', 'feedtype')
+ return parser.get('Planet', 'feedtype')
elif feed() and feed().find('atom')>=0:
return 'atom'
elif feed() and feed().find('rss')>=0:
diff --git a/planet/csv_config.py b/planet/csv_config.py
index ba3be61..9f905a6 100755
--- a/planet/csv_config.py
+++ b/planet/csv_config.py
@@ -13,7 +13,8 @@ def csv2config(input, config=None):
reader = csv.DictReader(input)
for row in reader:
section = row[reader.fieldnames[0]]
- config.add_section(section)
+ if not config.has_section(section):
+ config.add_section(section)
for name, value in row.items():
if value and name != reader.fieldnames[0]:
config.set(section, name, value)
diff --git a/planet/publish.py b/planet/publish.py
new file mode 100644
index 0000000..36df866
--- /dev/null
+++ b/planet/publish.py
@@ -0,0 +1,26 @@
+import os, sys
+import urlparse
+import planet
+import pubsubhubbub_publisher as PuSH
+
+def publish(config):
+ log = planet.logger
+ hub = config.pubsubhubbub_hub()
+ link = config.link()
+
+ # identify feeds
+ feeds = []
+ if hub and link:
+ for root, dirs, files in os.walk(config.output_dir()):
+ for file in files:
+ if file in config.pubsubhubbub_feeds():
+ feeds.append(urlparse.urljoin(link, file))
+
+ # publish feeds
+ if feeds:
+ try:
+ PuSH.publish(hub, feeds)
+ for feed in feeds:
+ log.info("Published %s to %s\n" % (feed, hub))
+ except PuSH.PublishError, e:
+ log.error("PubSubHubbub publishing error: %s\n" % e)
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index f5e910d..e2a69eb 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -25,7 +25,7 @@ try:
except:
from md5 import new as md5
-illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
+illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
@@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
+ value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
@@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c):
""" replace invalid characters """
- return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
+ return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value):
@@ -69,6 +70,7 @@ def id(xentry, entry):
if entry.has_key("id") and entry.id:
entry_id = entry.id
+ if hasattr(entry_id, 'values'): entry_id = entry_id.values()[0]
elif entry.has_key("link") and entry.link:
entry_id = entry.link
elif entry.has_key("title") and entry.title:
@@ -102,6 +104,8 @@ def links(xentry, entry):
xlink.setAttribute('type', link.get('type'))
if link.has_key('rel'):
xlink.setAttribute('rel', link.get('rel',None))
+ if link.has_key('title'):
+ xlink.setAttribute('title', link.get('title'))
if link.has_key('length'):
xlink.setAttribute('length', link.get('length'))
xentry.appendChild(xlink)
@@ -177,6 +181,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
+ if illegal_xml_chars.search(data.data):
+ data = xdoc.createTextNode(
+ illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')
@@ -225,6 +232,10 @@ def source(xsource, source, bozo, format):
for contributor in source.get('contributors',[]):
author(xsource, 'contributor', contributor)
+ if not source.has_key('links') and source.has_key('href'): #rss
+ source['links'] = [{ 'href': source.get('href') }]
+ if source.has_key('title'):
+ source['links'][0]['title'] = source.get('title')
links(xsource, source)
content(xsource, 'rights', source.get('rights_detail',None), bozo)
@@ -273,6 +284,11 @@ def reconstitute(feed, entry):
date(xentry, 'updated', entry_updated(feed.feed, entry, time.gmtime()))
date(xentry, 'published', entry.get('published_parsed',None))
+ if entry.has_key('dc_date.taken'):
+ date_Taken = createTextElement(xentry, '%s:%s' % ('dc','date_Taken'), '%s' % entry.get('dc_date.taken', None))
+ date_Taken.setAttribute('xmlns:%s' % 'dc', 'http://purl.org/dc/elements/1.1/')
+ xentry.appendChild(date_Taken)
+
for tag in entry.get('tags',[]):
category(xentry, tag)
@@ -298,6 +314,21 @@ def reconstitute(feed, entry):
if entry.has_key('geo_lat') and \
entry.has_key('geo_long'):
location(xentry, (float)(entry.get('geo_long',None)), (float)(entry.get('geo_lat',None)))
+ if entry.has_key('georss_point'):
+ coordinates = re.split('[,\s]', entry.get('georss_point'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_line'):
+ coordinates = re.split('[,\s]', entry.get('georss_line'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_circle'):
+ coordinates = re.split('[,\s]', entry.get('georss_circle'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
+ elif entry.has_key('georss_box'):
+ coordinates = re.split('[,\s]', entry.get('georss_box'))
+ location(xentry, ((float)(coordinates[1])+(float)(coordinates[3]))/2, ((float)(coordinates[0])+(float)(coordinates[2]))/2)
+ elif entry.has_key('georss_polygon'):
+ coordinates = re.split('[,\s]', entry.get('georss_polygon'))
+ location(xentry, (float)(coordinates[1]), (float)(coordinates[0]))
# author / contributor
author_detail = entry.get('author_detail',{})
diff --git a/planet/scrub.py b/planet/scrub.py
index 586edde..fef5c22 100644
--- a/planet/scrub.py
+++ b/planet/scrub.py
@@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
- # Run this through HTML5's serializer
- from html5lib import html5parser, sanitizer, treebuilders
+ # Run this through HTML5's sanitizer
+ doc = None
+ if 'xhtml' in node['type']:
+ try:
+ from xml.dom import minidom
+ doc = minidom.parseString(node['value'])
+ except:
+ node['type']='text/html'
+
+ if not doc:
+ from html5lib import html5parser, treebuilders
+ p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+ doc = p.parseFragment(node['value'], encoding='utf-8')
+
from html5lib import treewalkers, serializer
- p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
- tree=treebuilders.getTreeBuilder('dom'))
- doc = p.parseFragment(node.value, encoding='utf-8')
+ from html5lib.filters import sanitizer
+ walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
- walker = treewalkers.getTreeWalker('dom')
- tree = xhtml.serialize(walker(doc), encoding='utf-8')
+ tree = xhtml.serialize(walker, encoding='utf-8')
+
node['value'] = ''.join([str(token) for token in tree])
diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py
index 49b8557..a65b121 100644
--- a/planet/shell/__init__.py
+++ b/planet/shell/__init__.py
@@ -33,7 +33,7 @@ def run(template_file, doc, mode='template'):
log.info(" %s", os.path.realpath(template_dir))
logged_modes.append(mode)
return
- template_resolved = os.path.realpath(template_resolved)
+ template_resolved = os.path.abspath(template_resolved)
# Add shell directory to the path, if not already there
shellpath = os.path.join(sys.path[0],'planet','shell')
diff --git a/planet/shell/_genshi.py b/planet/shell/_genshi.py
index 5dffab2..7880a3a 100644
--- a/planet/shell/_genshi.py
+++ b/planet/shell/_genshi.py
@@ -3,7 +3,9 @@ from xml.sax.saxutils import escape
from genshi.input import HTMLParser, XMLParser
from genshi.template import Context, MarkupTemplate
+import planet
+log = planet.logger
subscriptions = []
feed_types = [
'application/atom+xml',
@@ -28,13 +30,15 @@ def find_config(config, feed):
if link.has_key('type') and link.type in feed_types:
if link.has_key('href') and link.href in subscriptions:
return norm(dict(config.parser.items(link.href)))
-
+
# match based on name
- for sub in subscriptions:
- if config.parser.has_option(sub, 'name') and \
- norm(config.parser.get(sub, 'name')) == feed.planet_name:
- return norm(dict(config.parser.items(sub)))
+ if 'planet_name' in feed:
+ for sub in subscriptions:
+ if config.parser.has_option(sub, 'name') and \
+ norm(config.parser.get(sub, 'name')) == feed.planet_name:
+ return norm(dict(config.parser.items(sub)))
+ log.warning('Could not match subscription to config: %s', feed.link)
return {}
class XHTMLParser(object):
@@ -68,7 +72,7 @@ def run(script, doc, output_file=None, options={}):
context = Context(**options)
tmpl_fileobj = open(script)
- tmpl = MarkupTemplate(tmpl_fileobj, script)
+ tmpl = MarkupTemplate(tmpl_fileobj, script, lookup="lenient")
tmpl_fileobj.close()
if not output_file:
diff --git a/planet/shell/dj.py b/planet/shell/dj.py
index c8a54a9..d2199fc 100644
--- a/planet/shell/dj.py
+++ b/planet/shell/dj.py
@@ -19,7 +19,7 @@ def run(script, doc, output_file=None, options={}):
# I need to re-import the settings at every call because I have to
# set the TEMPLATE_DIRS variable programmatically
from django.conf import settings
- settings._wrapped = None
+ settings._wrapped=None
try:
settings.configure(
DEBUG=True, TEMPLATE_DEBUG=True,
@@ -32,7 +32,7 @@ def run(script, doc, output_file=None, options={}):
# set up the Django context by using the default htmltmpl
# datatype converters
- context = Context()
+ context = Context(autoescape=(config.django_autoescape()=='on'))
context.update(tmpl.template_info(doc))
context['Config'] = config.planet_options()
t = get_template(script)
diff --git a/planet/shell/tmpl.py b/planet/shell/tmpl.py
index 3c8cb6b..b0f238f 100644
--- a/planet/shell/tmpl.py
+++ b/planet/shell/tmpl.py
@@ -231,6 +231,7 @@ def template_info(source):
output['link'] = config.link()
output['owner_name'] = config.owner_name()
output['owner_email'] = config.owner_email()
+ output['pubsubhubbub_hub'] = config.pubsubhubbub_hub()
if config.feed():
output['feed'] = config.feed()
output['feedtype'] = config.feed().find('rss')>=0 and 'rss' or 'atom'
@@ -267,8 +268,10 @@ def run(script, doc, output_file=None, options={}):
tp.set(key, value)
if output_file:
+ basename = os.path.basename(output_file)
reluri = os.path.splitext(os.path.basename(output_file))[0]
tp.set('url', urlparse.urljoin(config.link(),reluri))
+ tp.set('fullurl', urlparse.urljoin(config.link(),basename))
output = open(output_file, "w")
output.write(tp.process(template))
diff --git a/planet/shell/xslt.py b/planet/shell/xslt.py
index 0b6579f..24173ea 100644
--- a/planet/shell/xslt.py
+++ b/planet/shell/xslt.py
@@ -8,7 +8,7 @@ def quote(string, apos):
if string.find("'")<0:
return "'" + string + "'"
- elif string.find("'")<0:
+ elif string.find('"')<0:
return '"' + string + '"'
else:
# unclear how to quote strings with both types of quotes for libxslt
diff --git a/planet/spider.py b/planet/spider.py
index 59afcb6..50d1739 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -69,6 +69,7 @@ def _is_http_uri(uri):
def writeCache(feed_uri, feed_info, data):
log = planet.logger
sources = config.cache_sources_directory()
+ blacklist = config.cache_blacklist_directory()
# capture http status
if not data.has_key("status"):
@@ -125,7 +126,7 @@ def writeCache(feed_uri, feed_info, data):
log.info("Updating feed %s", feed_uri)
# if read failed, retain cached information
- if not data.version and feed_info.version:
+ if not data.get('version') and feed_info.get('version'):
data.feed = feed_info.feed
data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
data.version = feed_info.feed.get('planet_format')
@@ -147,7 +148,7 @@ def writeCache(feed_uri, feed_info, data):
data.feed['planet_content_hash'] = data.headers['-content-hash']
# capture feed and data from the planet configuration file
- if data.version:
+ if data.get('version'):
if not data.feed.has_key('links'): data.feed['links'] = list()
feedtype = 'application/atom+xml'
if data.version.startswith('rss'): feedtype = 'application/rss+xml'
@@ -175,7 +176,9 @@ def writeCache(feed_uri, feed_info, data):
# generate an id, if none is present
if not entry.has_key('id') or not entry.id:
entry['id'] = reconstitute.id(None, entry)
- if not entry['id']: continue
+ elif hasattr(entry['id'], 'values'):
+ entry['id'] = entry['id'].values()[0]
+ if not entry['id']: continue
# determine updated date for purposes of selection
updated = ''
@@ -190,6 +193,13 @@ def writeCache(feed_uri, feed_info, data):
cache = config.cache_directory()
for updated, entry in ids.values():
+ # compute blacklist file name based on the id
+ blacklist_file = filename(blacklist, entry.id)
+
+ # check if blacklist file exists. If so, skip it.
+ if os.path.exists(blacklist_file):
+ continue
+
# compute cache file name based on the id
cache_file = filename(cache, entry.id)
@@ -420,8 +430,6 @@ def spiderPlanet(only_if_new = False):
# Process the results as they arrive
feeds_seen = {}
while fetch_queue.qsize() or parse_queue.qsize() or threads:
- while parse_queue.qsize() == 0 and threads:
- time.sleep(0.1)
while parse_queue.qsize():
(uri, feed_info, feed) = parse_queue.get(False)
try:
@@ -479,6 +487,8 @@ def spiderPlanet(only_if_new = False):
traceback.format_tb(tb)):
log.error(line.rstrip())
+ time.sleep(0.1)
+
for index in threads.keys():
if not threads[index].isAlive():
del threads[index]
diff --git a/planet/splice.py b/planet/splice.py
index f751975..b399eca 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -44,6 +44,12 @@ def splice():
link.setAttribute('type', "application/%s+xml" % config.feedtype())
feed.appendChild(link)
+ if config.pubsubhubbub_hub():
+ hub = doc.createElement('link')
+ hub.setAttribute('rel', 'hub')
+ hub.setAttribute('href', config.pubsubhubbub_hub())
+ feed.appendChild(hub)
+
if config.link():
link = doc.createElement('link')
link.setAttribute('rel', 'alternate')
@@ -58,6 +64,21 @@ def splice():
data=feedparser.parse(filename(sources,sub))
if data.feed.has_key('id'): sub_ids.append(data.feed.id)
if not data.feed: continue
+
+ # warn on missing links
+ if not data.feed.has_key('planet_message'):
+ if not data.feed.has_key('links'): data.feed['links'] = []
+
+ for link in data.feed.links:
+ if link.rel == 'self': break
+ else:
+ log.debug('missing self link for ' + sub)
+
+ for link in data.feed.links:
+ if link.rel == 'alternate' and 'html' in link.type: break
+ else:
+ log.debug('missing html link for ' + sub)
+
xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
reconstitute.source(xdoc.documentElement, data.feed, None, None)
@@ -68,6 +89,7 @@ def splice():
# insert entry information
items = 0
count = {}
+ atomNS='http://www.w3.org/2005/Atom'
new_feed_items = config.new_feed_items()
for mtime,file in dir:
if index != None:
@@ -81,7 +103,7 @@ def splice():
# number of entries contributed by this feed does not exceed
# config.new_feed_items
entry.normalize()
- sources = entry.getElementsByTagName('source')
+ sources = entry.getElementsByTagNameNS(atomNS, 'source')
if sources:
ids = sources[0].getElementsByTagName('id')
if ids:
@@ -93,6 +115,8 @@ def splice():
ids = sources[0].getElementsByTagName('planet:id')
if not ids: continue
id = ids[0].childNodes[0].nodeValue
+ if id not in sub_ids:
+ log.warn('Skipping: ' + id)
if id not in sub_ids: continue
# add entry to feed
diff --git a/planet/vendor/feedparser.py b/planet/vendor/feedparser.py
index 6518126..76167ce 100755
--- a/planet/vendor/feedparser.py
+++ b/planet/vendor/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
-__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 314 $"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@@ -1131,7 +1131,7 @@ class _FeedParserMixin:
def _getContext(self):
if self.insource:
context = self.sourcedata
- elif self.inimage:
+ elif self.inimage and self.feeddata.has_key('image'):
context = self.feeddata['image']
elif self.intextinput:
context = self.feeddata['textinput']
@@ -1595,9 +1595,12 @@ if _XML_AVAILABLE:
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
+ self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
+ if uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
@@ -1622,7 +1625,7 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
+ attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
@@ -1679,8 +1682,11 @@ if _XML_AVAILABLE:
class _BaseHTMLProcessor(sgmllib.SGMLParser):
special = re.compile('''[<>'"]''')
bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
+ elements_no_end_tag = [
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ]
def __init__(self, encoding, type):
self.encoding = encoding
@@ -2461,6 +2467,15 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
+ # add implicit namespaces to html5 inline svg/mathml
+ if self.type.endswith('html'):
+ if tag=='svg':
+ if not dict(attrs).get('xmlns'):
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ if not dict(attrs).get('xmlns'):
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
# not otherwise acceptable, perhaps it is MathML or SVG?
if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
self.mathmlOK += 1
diff --git a/planet/vendor/html5lib/__init__.py b/planet/vendor/html5lib/__init__.py
index 7a20994..ae64f14 100644
--- a/planet/vendor/html5lib/__init__.py
+++ b/planet/vendor/html5lib/__init__.py
@@ -8,9 +8,10 @@ Example usage:
import html5lib
f = open("my_document.html")
-p = html5lib.HTMLParser()
-tree = p.parse(f)
+tree = html5lib.parse(f)
"""
-from html5parser import HTMLParser, parse
+__version__ = "%(version)s"
+from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
+from treewalkers import getTreeWalker
from serializer import serialize
diff --git a/planet/vendor/html5lib/constants.py b/planet/vendor/html5lib/constants.py
index c9f5883..f9521c8 100644
--- a/planet/vendor/html5lib/constants.py
+++ b/planet/vendor/html5lib/constants.py
@@ -180,6 +180,8 @@ E = {
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
+ "unexpected-form-in-table":
+ _(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
@@ -256,21 +258,18 @@ E = {
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
+ "eof-in-script-in-script":
+ _(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
+ "unexpected-end-tag-before-html":
+ _(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
-contentModelFlags = {
- "PCDATA":0,
- "RCDATA":1,
- "CDATA":2,
- "PLAINTEXT":3
-}
-
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
@@ -509,6 +508,8 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
+xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
+
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
@@ -878,6 +879,44 @@ entities = {
"zwnj;": u"\u200C"
}
+replacementCharacters = {
+ 0x0:u"\uFFFD",
+ 0x0d:u"\u000A",
+ 0x80:u"\u20AC",
+ 0x81:u"\u0081",
+ 0x81:u"\u0081",
+ 0x82:u"\u201A",
+ 0x83:u"\u0192",
+ 0x84:u"\u201E",
+ 0x85:u"\u2026",
+ 0x86:u"\u2020",
+ 0x87:u"\u2021",
+ 0x88:u"\u02C6",
+ 0x89:u"\u2030",
+ 0x8A:u"\u0160",
+ 0x8B:u"\u2039",
+ 0x8C:u"\u0152",
+ 0x8D:u"\u008D",
+ 0x8E:u"\u017D",
+ 0x8F:u"\u008F",
+ 0x90:u"\u0090",
+ 0x91:u"\u2018",
+ 0x92:u"\u2019",
+ 0x93:u"\u201C",
+ 0x94:u"\u201D",
+ 0x95:u"\u2022",
+ 0x96:u"\u2013",
+ 0x97:u"\u2014",
+ 0x98:u"\u02DC",
+ 0x99:u"\u2122",
+ 0x9A:u"\u0161",
+ 0x9B:u"\u203A",
+ 0x9C:u"\u0153",
+ 0x9D:u"\u009D",
+ 0x9E:u"\u017E",
+ 0x9F:u"\u0178",
+}
+
encodings = {
'437': 'cp437',
'850': 'cp850',
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index a8e5a1f..5ff742a 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -4,6 +4,29 @@ except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
+
+try:
+ any
+except:
+ # Implement 'any' for python 2.4 and previous
+ def any(iterable):
+ for element in iterable:
+ if element:
+ return True
+ return False
+
+try:
+ "abc".startswith(("a", "b"))
+ def startswithany(str, prefixes):
+ return str.startswith(prefixes)
+except:
+ # Python 2.4 doesn't accept a tuple as argument to string startswith
+ def startswithany(str, prefixes):
+ for prefix in prefixes:
+ if str.startswith(prefix):
+ return True
+ return False
+
import sys
import inputstream
@@ -14,7 +37,7 @@ from treebuilders._base import Marker
from treebuilders import simpletree
import utils
-from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
+from constants import spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
@@ -26,6 +49,12 @@ def parse(doc, treebuilder="simpletree", encoding=None,
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding)
+def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
+ namespaceHTMLElements=True):
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parseFragment(doc, container=container, encoding=encoding)
+
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
@@ -60,7 +89,7 @@ class HTMLParser(object):
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
"afterHead": AfterHeadPhase(self, self.tree),
"inBody": InBodyPhase(self, self.tree),
- "inCDataRCData": InCDataRCDataPhase(self, self.tree),
+ "text": TextPhase(self, self.tree),
"inTable": InTablePhase(self, self.tree),
"inTableText": InTableTextPhase(self, self.tree),
"inCaption": InCaptionPhase(self, self.tree),
@@ -107,14 +136,14 @@ class HTMLParser(object):
self.innerHTML = self.container.lower()
if self.innerHTML in cdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+ self.tokenizer.state = self.tokenizer.rcdataState
elif self.innerHTML in rcdataElements:
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+ self.tokenizer.state = self.tokenizer.rawtextState
elif self.innerHTML == 'plaintext':
- self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+ self.tokenizer.state = self.tokenizer.plaintextState
else:
- # contentModelFlag already is PCDATA
- #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
+ # state already is data state
+ # self.tokenizer.state = self.tokenizer.dataState
pass
self.phase = self.phases["beforeHtml"]
self.phase.insertHtmlElement()
@@ -152,8 +181,6 @@ class HTMLParser(object):
for token in self.normalizedTokens():
- #print self.phase.__class__.__name__
- #print token
type = token["type"]
if type == CharactersToken:
self.phase.processCharacters(token)
@@ -376,18 +403,22 @@ class HTMLParser(object):
self.phase = self.phases["inBody"]
break
- def parseRCDataCData(self, token, contentType):
- """Generic (R)CDATA Parsing algorithm
- contentType - RCDATA or CDATA
+ def parseRCDataRawtext(self, token, contentType):
+ """Generic RCDATA/RAWTEXT Parsing algorithm
+ contentType - RCDATA or RAWTEXT
"""
- assert contentType in ("CDATA", "RCDATA")
+ assert contentType in ("RAWTEXT", "RCDATA")
element = self.tree.insertElement(token)
- self.tokenizer.contentModelFlag = contentModelFlags[contentType]
+
+ if contentType == "RAWTEXT":
+ self.tokenizer.state = self.tokenizer.rawtextState
+ else:
+ self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
- self.phase = self.phases["inCDataRCData"]
+ self.phase = self.phases["text"]
class Phase(object):
"""Base class for helper object that implements each phase of processing
@@ -441,34 +472,24 @@ class Phase(object):
self.endTagHandler[token["name"]](token)
class InitialPhase(Phase):
- # This phase deals with error handling as well which is currently not
- # covered in the specification. The error handling is typically known as
- # "quirks mode". It is expected that a future version of HTML5 will defin
- # this.
- def processEOF(self):
- self.parser.parseError("expected-doctype-but-got-eof")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
- self.parser.phase.processEOF()
-
+ def processSpaceCharacters(self, token):
+ pass
+
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
-
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId != None or
- systemId != None):
+ systemId != None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
- if systemId is None:
- systemId = ""
self.tree.insertDoctype(token)
@@ -476,117 +497,108 @@ class InitialPhase(Phase):
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html"
- or publicId in
- ("+//silmaril//dtd html pro v0r11 19970101//en",
- "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
- "-//as//dtd html 3.0 aswedit + extensions//en",
- "-//ietf//dtd html 2.0 level 1//en",
- "-//ietf//dtd html 2.0 level 2//en",
- "-//ietf//dtd html 2.0 strict level 1//en",
- "-//ietf//dtd html 2.0 strict level 2//en",
- "-//ietf//dtd html 2.0 strict//en",
- "-//ietf//dtd html 2.0//en",
- "-//ietf//dtd html 2.1e//en",
- "-//ietf//dtd html 3.0//en",
- "-//ietf//dtd html 3.0//en//",
- "-//ietf//dtd html 3.2 final//en",
- "-//ietf//dtd html 3.2//en",
- "-//ietf//dtd html 3//en",
- "-//ietf//dtd html level 0//en",
- "-//ietf//dtd html level 0//en//2.0",
- "-//ietf//dtd html level 1//en",
- "-//ietf//dtd html level 1//en//2.0",
- "-//ietf//dtd html level 2//en",
- "-//ietf//dtd html level 2//en//2.0",
- "-//ietf//dtd html level 3//en",
- "-//ietf//dtd html level 3//en//3.0",
- "-//ietf//dtd html strict level 0//en",
- "-//ietf//dtd html strict level 0//en//2.0",
- "-//ietf//dtd html strict level 1//en",
- "-//ietf//dtd html strict level 1//en//2.0",
- "-//ietf//dtd html strict level 2//en",
- "-//ietf//dtd html strict level 2//en//2.0",
- "-//ietf//dtd html strict level 3//en",
- "-//ietf//dtd html strict level 3//en//3.0",
- "-//ietf//dtd html strict//en",
- "-//ietf//dtd html strict//en//2.0",
- "-//ietf//dtd html strict//en//3.0",
- "-//ietf//dtd html//en",
- "-//ietf//dtd html//en//2.0",
- "-//ietf//dtd html//en//3.0",
- "-//metrius//dtd metrius presentational//en",
- "-//microsoft//dtd internet explorer 2.0 html strict//en",
- "-//microsoft//dtd internet explorer 2.0 html//en",
- "-//microsoft//dtd internet explorer 2.0 tables//en",
- "-//microsoft//dtd internet explorer 3.0 html strict//en",
- "-//microsoft//dtd internet explorer 3.0 html//en",
- "-//microsoft//dtd internet explorer 3.0 tables//en",
- "-//netscape comm. corp.//dtd html//en",
- "-//netscape comm. corp.//dtd strict html//en",
- "-//o'reilly and associates//dtd html 2.0//en",
- "-//o'reilly and associates//dtd html extended 1.0//en",
- "-//o'reilly and associates//dtd html extended relaxed 1.0//en",
- "-//spyglass//dtd html 2.0 extended//en",
- "-//sq//dtd html 2.0 hotmetal + extensions//en",
- "-//sun microsystems corp.//dtd hotjava html//en",
- "-//sun microsystems corp.//dtd hotjava strict html//en",
- "-//w3c//dtd html 3 1995-03-24//en",
- "-//w3c//dtd html 3.2 draft//en",
- "-//w3c//dtd html 3.2 final//en",
- "-//w3c//dtd html 3.2//en",
- "-//w3c//dtd html 3.2s draft//en",
- "-//w3c//dtd html 4.0 frameset//en",
- "-//w3c//dtd html 4.0 transitional//en",
- "-//w3c//dtd html experimental 19960712//en",
- "-//w3c//dtd html experimental 970421//en",
- "-//w3c//dtd w3 html//en",
- "-//w3o//dtd w3 html 3.0//en",
- "-//w3o//dtd w3 html 3.0//en//",
- "-//w3o//dtd w3 html strict 3.0//en//",
- "-//webtechs//dtd mozilla html 2.0//en",
- "-//webtechs//dtd mozilla html//en",
- "-/w3c/dtd html 4.0 transitional/en",
- "html")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)
- or (systemId != None and
- systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+ or startswithany(publicId,
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//"))
+ or publicId in
+ ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")
+ or startswithany(publicId,
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId == None
+ or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
- elif (publicId in
- ("-//w3c//dtd xhtml 1.0 frameset//EN",
- "-//w3c//dtd xhtml 1.0 transitional//EN")
- or (publicId in
- ("-//w3c//dtd html 4.01 frameset//EN",
- "-//w3c//dtd html 4.01 transitional//EN") and
- systemId == None)):
+ elif (startswithany(publicId,
+ ("-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//"))
+ or startswithany(publicId,
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId != None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
-
- def processSpaceCharacters(self, token):
- pass
+
+ def anythingElse(self):
+ self.parser.compatMode = "quirks"
+ self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processCharacters(token)
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
- self.parser.compatMode = "quirks"
- self.parser.phase = self.parser.phases["beforeHtml"]
+ self.anythingElse()
self.parser.phase.processEndTag(token)
+
+ def processEOF(self):
+ self.parser.parseError("expected-doctype-but-got-eof")
+ self.anythingElse()
+ self.parser.phase.processEOF()
class BeforeHtmlPhase(Phase):
@@ -617,8 +629,12 @@ class BeforeHtmlPhase(Phase):
self.parser.phase.processStartTag(token)
def processEndTag(self, token):
- self.insertHtmlElement()
- self.parser.phase.processEndTag(token)
+ if token["name"] not in ("head", "body", "html", "br"):
+ self.parser.parseError("unexpected-end-tag-before-html",
+ {"name": token["name"]})
+ else:
+ self.insertHtmlElement()
+ self.parser.phase.processEndTag(token)
class BeforeHeadPhase(Phase):
@@ -632,7 +648,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
- (("head", "br"), self.endTagImplyHead)
+ (("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -647,6 +663,9 @@ class BeforeHeadPhase(Phase):
self.startTagHead(impliedTagToken("head", "StartTag"))
self.parser.phase.processCharacters(token)
+ def startTagHtml(self, token):
+ self.parser.phases["inBody"].processStartTag(token)
+
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
@@ -673,8 +692,8 @@ class InHeadPhase(Phase):
("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
("script", self.startTagScript),
- (("base", "link", "command", "eventsource"),
- self.startTagBaseLinkCommandEventsource),
+ (("base", "link", "command"),
+ self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
@@ -709,7 +728,7 @@ class InHeadPhase(Phase):
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
- def startTagBaseLinkCommandEventsource(self, token):
+ def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
@@ -724,23 +743,27 @@ class InHeadPhase(Phase):
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif "content" in attributes:
- data = inputstream.EncodingBytes(
- attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
+ # Encoding it as UTF-8 here is a hack, as really we should pass
+ # the abstract Unicode string, and just use the
+ # ContentAttrParser on that, but using UTF-8 allows all chars
+ # to be encoded and as a ASCII-superset works.
+ data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
- self.parser.parseRCDataCData(token, "RCDATA")
+ self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token):
#Need to decide whether to implement the scripting-disabled case
- self.parser.parseRCDataCData(token, "CDATA")
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagScript(self, token):
- #I think this is equivalent to the CDATA stuff since we don't execute script
- #self.tree.insertElement(token)
- self.parser.parseRCDataCData(token, "CDATA")
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+ self.parser.originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
@@ -819,7 +842,6 @@ class AfterHeadPhase(Phase):
self.parser.phase.processStartTag(token)
def endTagHtmlBodyBr(self, token):
- #This is not currently in the spec
self.anythingElse()
self.parser.phase.processEndTag(token)
@@ -833,8 +855,8 @@ class AfterHeadPhase(Phase):
class InBodyPhase(Phase):
- # http://www.whatwg.org/specs/web-apps/current-work/#in-body
- # the crazy mode
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+ # the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
@@ -843,15 +865,16 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
- (("base", "link", "meta", "script", "style", "title"),
- self.startTagProcessInHead),
+ (("base", "command", "link", "meta", "noframes", "script", "style",
+ "title"), self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "datagrid",
- "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
- "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "listing",
- "menu", "nav", "ol", "p", "pre", "section", "ul"),
+ "details", "dir", "div", "dl", "fieldset", "figure",
+ "footer", "header", "hgroup", "menu", "nav", "ol", "p",
+ "section", "ul"),
self.startTagCloseP),
+ (("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "d