This module renders an ATOM feed.
It's possible for any page in the Arcology now to define an #+ARCOLOGY_FEED
keyword, and in doing so create
a new route in the Arcology Public
Router which will render an Atom feed. The semantics of the feed
more-or-less follow the expectations defined in ox-rss
: Any heading with an ID
property and a PUBDATE
property with an org-mode active
timestamp in it will be published to the feed. Any entry with an ID will
have a PUBDATE
added to it by invoking
(org-rss-add-pubdate-property)
.
Invoking Pandoc for the Feed Generator
To get an ATOM feed for an org document, it's easy enough to invoke
render_feed_from_file. I'm shelling out to pandoc directly.
Probably shouldn't have reached for that thing in the first place! Oh
well.
import re
from fastapi import Response, HTTPException
import asyncio
from sqlmodel import Session
from async_lru import alru_cache
from typing import Optional
from arcology.html import rewrite_html
from arcology.key import ArcologyKey
from arcology.parse import parse_sexp, print_sexp
from arcology.arroyo import Page
This is pretty straightforward, except I stick an LRU cache in the middle of it so that feed readers aren't constantly invoking Pandoc.
class ExportException(BaseException):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Attributes:
        code: the pandoc process's exit status.
        stderr: decoded stderr output from pandoc, if captured.
    """
    # NOTE(review): subclassing BaseException means a generic
    # `except Exception` handler will NOT catch this; confirm that is
    # intentional before changing the base class.
    code: int
    stderr: Optional[str]

    def __init__(self, code: int, stderr: Optional[str] = None):
        self.code = code
        self.stderr = stderr
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Render the org document at `file` to an Atom XML string via pandoc.

    `hash` participates only in the LRU cache key: passing the file's
    content hash makes cached output invalidate whenever the file changes,
    so feed readers aren't constantly re-invoking pandoc.

    Raises:
        ExportException: when pandoc exits non-zero; carries exit code
            and decoded stderr.
    """
    # argv form rather than a shell string: no quoting/injection issues
    # if the file path ever contains shell metacharacters.
    proc = await asyncio.create_subprocess_exec(
        "pandoc", file,
        "--lua-filter=./arcology/pandoc/make-atom.lua",
        "--template=./arcology/pandoc/atom.xml",
        "-s",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE)
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise ExportException(code=proc.returncode, stderr=stderr.decode())
    return stdout.decode()
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Resolve `file` to a Page and return its hydrated Atom feed body.

    Raises:
        HTTPException: 404 when no page exists for `file`; 500 when the
            pandoc export fails.
    """
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")
        try:
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # chain the cause so the pandoc failure survives in tracebacks
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e
        return hydrate_feed(file, xml, session)
The feed is more-or-less ready as-is when it comes out of Pandoc,
except for the final "canonical" URL – a substitution will replace a
stand-in variable with the correct URL. I could probably inject this in
to the Pandoc invocation as a metadata variable but this is Good Enough.
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Post-process pandoc's Atom output for the page backing `filename`.

    Substitutes the {ARCOLOGY_FEED_PAGE} stand-in with the page's
    canonical URL, then rewrites internal links the same way the HTML
    pages are rewritten.
    """
    page = Page.from_file(filename, session)
    akey = ArcologyKey(page.get_key())
    # str.replace instead of re.sub: the pattern is a literal, and re.sub
    # would interpret backslash escapes in the substituted URL.
    out_xml = xml.replace('{ARCOLOGY_FEED_PAGE}', akey.to_url())
    out_xml = rewrite_html(out_xml, session)  # lol dangerous
    return out_xml
Rendering Atom from Org in Pandoc in two steps
I had some real trouble figuring out how to get Pandoc to spit out ATOM feeds and this is not "technically compliant" but I can do it with a combination of a Lua filter which extracts headings' metadata in to variables which a custom template then renders out:
Lua Filter
local utils = require 'pandoc.utils'

-- Accumulators shared across the filter passes below; copied into the
-- document metadata by set_entries_and_date at the end of the run.
local entries = {}   -- one record per PUBDATE'd heading (see extract_entries)
local keywords = {}  -- tags gathered from FILETAGS / keywords metadata
local variables = {} -- "#+NAME: value" keywords captured from raw org blocks

-- RawBlock pass: capture org "#+NAME: value" keyword lines so they can be
-- exposed as template variables.
set_meta_from_raw = function (raw)
  -- Don't do anything unless the block contains *org* markup.
  if raw.format ~= 'org' then return nil end

  -- extract variable name and value
  local name, value = raw.text:match '#%+(%w+):%s*(.+)$'
  if name and value then
    variables[name] = value
  end
end
-- thanks random github users https://gist.github.com/zwh8800/9b0442efadc97408ffff248bc8573064
-- Seconds between the platform epoch and 1970-01-01 00:00; subtracting it
-- normalizes os.time's result to a unix-style timestamp.
-- NOTE(review): os.time interprets its fields in *local* time, so the
-- resulting timestamps carry the server's UTC offset — confirm acceptable.
local epoch = os.time{year=1970, month=1, day=1, hour=0}

-- Parse an org active timestamp like <2023-01-05 Thu 12:30> into a numeric
-- timestamp suitable for os.date.
-- NOTE(review): the pattern requires an HH:MM component; a date-only
-- timestamp (<2023-01-05 Thu>) fails the match, leaving nils that make
-- os.time error — verify all PUBDATEs include a time.
function parse_org_date(org_date)
  local year, month, day, hour, minute = org_date:match("<?(%d+)%-(%d+)%-(%d+)%s%a+%s(%d+)%:(%d+)>?")
  local timestamp = os.time{year = year, month = month, day = day, hour = hour, min = minute, sec = 0} - epoch
  return timestamp
end
-- strftime-style format approximating RFC 3339 (%T expands to %H:%M:%S).
rfc3339ish = "%Y-%m-%dT%TZ"

-- Meta pass: copy the captured #+KEYWORD variables into the metadata
-- table, flatten FILETAGS / keywords into the `keywords` list, default
-- the date to "now", and attach the entries collected by extract_entries.
set_entries_and_date = function(m)
  for name, value in pairs(variables) do
    m[name] = value
  end

  if m["FILETAGS"] ~= nil then
    kw_str = utils.stringify(m["FILETAGS"])
    -- FILETAGS are colon-delimited (:tag1:tag2:); keep the non-empty parts
    kw_str:gsub("([^:]*)", function(tag)
      if tag ~= "" then
        table.insert(keywords, tag)
      end
    end)
  end

  if m["keywords"] ~= nil then
    kw_str = utils.stringify(m["keywords"])
    -- keywords are comma- or space-separated
    kw_str:gsub("([^, ]*)", function(tag)
      if tag ~= "" then
        table.insert(keywords, tag)
      end
    end)
  end

  if m.date == nil then
    m.date = os.date(rfc3339ish) --current time in iso8601/rfc3339
  end

  m.entries = entries
  m.keywords = keywords
  return m
end
-- Blocks pass: group headings into section Divs, then turn every section
-- whose heading carried a PUBDATE property into an Atom entry record
-- appended to `entries`.
extract_entries = function (blocks)
  pandoc.utils.make_sections(true, nil, blocks):walk
  {
    Div = function (div)
      if div.attributes.pubdate then
        local header = div.content[1]
        -- drop bookkeeping attributes so they don't leak into the output
        header.attributes.pubdate = nil
        header.attributes.number = nil
        div.attributes.number = nil

        -- collect org tag Spans attached to the heading; they become the
        -- entry's <category> terms
        header_tags = {}
        title_subbed = utils.stringify(header.content)
        for k,v in pairs(header.content) do
          if v.tag == "Span" and v.attributes["tag-name"] ~= nil then
            table.insert(header_tags, v.attributes["tag-name"])
          end
        end

        entry = {
          content=div,
          title=title_subbed,
          rel=div.attributes.id,
          pubdate=os.date(rfc3339ish, parse_org_date(div.attributes.pubdate))
        }
        if #header_tags > 0 then
          entry["categories"] = header_tags
        end
        table.insert(entries, entry)
      end
    end
  }
end
-- Filter list: a single pass wiring each pandoc element type to its
-- handler defined above.
return {
  {
    RawBlock = set_meta_from_raw,
    Blocks = extract_entries,
    Meta = set_entries_and_date
  }
}
Pandoc Template
The template is basically unremarkable, but has the same issues that the HTML files have: they need to have their ID links fixed.
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>$pagetitle$</title>
<link href="{ARCOLOGY_FEED_PAGE}"/>
<updated>$date$</updated>
<author>
$for(author)$
<name>${author}</name>
$endfor$
</author>
<id>{ARCOLOGY_FEED_PAGE}</id>
<link rel="self" type="application/atom+xml" href="{ARCOLOGY_FEED_PAGE}.xml"/>
$for(entries)$
<entry>
<title type="html">${it.title}</title>
<link href="{ARCOLOGY_FEED_PAGE}#${it.rel}"/>
<id>{ARCOLOGY_FEED_PAGE}#${it.rel}</id>
<updated>${it.pubdate}</updated>
$for(it.categories)$
<category term="${it}" />
$endfor$
$for(keywords)$
<category term="${it}" />
$endfor$
<summary type="html">${it.content}</summary>
</entry>
$endfor$
</feed>
And this can be confirmed to work with e.g. The Lion's Rear Site Feed:
pandoc ../thelionsrear_updates.org --lua-filter=arcology/pandoc/make-atom.lua --template=arcology/pandoc/atom.xml -s
Notice that it still has to go through the process of Rewriting and Hydrating that the
HTML docs have to go through so that links and whatnot work. This is
handled in hydrate_feed
above.
Listing the Arcology Feeds
Since the feeds exist in the Arroyo Cache K/V/F store, they can be extracted to shove in to the <head> for example.
This is poor data modeling, however, and it's likely that we will
benefit from an Arroyo Arcology
Generator which extracts ARCOLOGY_FEED
entities to associate them with the page/file they're embedded in.
from typing import List
from sqlmodel import select, Session
from arcology.arroyo import Keyword
from arcology.parse import parse_sexp
from arcology.key import ArcologyKey
These helpers prepare the data for make_feed_entries
. get_feed_keys
will return the list of all ARCOLOGY_FEED
routing keys, and get_feed_files
returns the files associated with
those keys.
def arcology_feed_q():
    """Query selecting every ARCOLOGY_FEED keyword row from the cache.

    The keyword column stores printed sexps, hence the embedded double
    quotes in the comparison literal.
    """
    return select(Keyword).where(Keyword.keyword=='"ARCOLOGY_FEED"')
def get_feed_keys(session) -> List[str]:
    """Return the routing key of every ARCOLOGY_FEED keyword entry."""
    keys = []
    for kw_row in session.exec(arcology_feed_q()):
        keys.append(parse_sexp(kw_row.value))
    return keys
def get_feed_files(session) -> List[str]:
    """Return the source file of every ARCOLOGY_FEED keyword entry."""
    files = []
    for kw_row in session.exec(arcology_feed_q()):
        files.append(parse_sexp(kw_row.file))
    return files
make_feed_entries
exposes just why the
data model is a bit weak.
We have to build the mapping using the return of get_feed_files
so that the feeds' pages' titles
can be applied in the final return value.
We use the site_key
to make sure it's
filtered to only show feeds related to the current Arcology Site. It's certainly
simpler to show all feeds for all
sites, but in the future I may want to have sites which are at least
somewhat hidden, and so showing them in the global feed discovery
mechanism is quite a silly thing to build in. If the site keys don't
match, the title isn't added to the dict…
# Map feed file -> page title, but only for pages belonging to the
# current site; feeds on other sites are silently skipped.
feed_page_titles = dict()  # file -> title
for feed_file in get_feed_files(session):
    p = Page.from_file(feed_file, session)
    if p.get_site().key == site_key:
        feed_page_titles[feed_file] = p.get_title()
If the file isn't set in the feed_page_titles
dict, we know that it's been
skipped. The feed URL is generated using arcology.key.ArcologyKey, and the
title and URL are added to the return list in a tuple.
# Build the (feed_url, page_title) return list; feeds whose file was
# filtered out of feed_page_titles above are skipped.
ret = list()
for feed_key in get_feed_keys(session):
    feed_url = ArcologyKey(feed_key).to_url()
    feed_file = Keyword.get("ARCOLOGY_FEED", feed_key, session=session).filename()
    if feed_page_titles.get(feed_file, None):
        ret.append((feed_url, feed_page_titles[feed_file]))
Splat!
def make_feed_entries(site_key: str, session):
    """Return (feed_url, page_title) tuples for every feed on `site_key`.

    The noweb references below are tangled in from the chunks defined
    earlier in this document.
    """
    <<populateDict>>
    <<populateRetVal>>
    return ret
INPROGRESS
Arroyo Arcology Generator for
ARCOLOGY_FEED
keys
All of this becomes much simpler with a Arroyo Arcology Generator schema like, maybe, this:
(arcology-feeds
[(file :not-null)
(key :not-null)
(title :not-null)
(site :not-null)
(post-visibility :not-null)
(hash :not-null)])
then things like select(Feed.key, Feed.title).where(Feed.site ==
"lionsrear") become trivial.
Port this code to use arcology.arroyo.Feed, something like:
select(arroyo.Page).where(arroyo.Page.sitekey==sitekey) -> file select(arroyo.Feed).where(arroyo.Feed.file==file)