# Source code for wwt_data_formats.filecabinet
# -*- mode: python; coding: utf-8 -*-
# Copyright 2019-2020 the .NET Foundation
# Licensed under the MIT License.
from __future__ import absolute_import, division, print_function
# Public API of this module: the whitespace-separated names below are split
# into the list of names exported by a star-import.
__all__ = '''
FileCabinetReader
FileCabinetWriter
'''.split()
from collections import namedtuple, OrderedDict
from xml.etree import ElementTree as etree
from . import stringify_xml_doc
# Index entry for one file in a cabinet being read: its name, its absolute
# byte offset within the stream, and its size in bytes.
ReaderFileInfo = namedtuple('ReaderFileInfo', 'name offset size')


class FileCabinetReader(object):
    """Reader for a simple container format for other files.

    Unlike most of the other WWT data formats implemented in this package, the
    file cabinet is not an XML serialization of a data structure. Instead,
    it's a container format for bundling files together: an XML header that
    lists each file's name, size, and offset, followed by the concatenated
    file contents.

    The *stream* argument must be a seekable binary stream. The header is
    parsed eagerly in the constructor; file contents are only read on demand
    by :meth:`read_file`.

    Raises :exc:`ValueError` if the ``HeaderSize`` marker cannot be located or
    decoded, and :exc:`Exception` if the header's file records are missing,
    incomplete, malformed, implausible, or duplicated.

    """

    _stream = None
    _files = None

    def __init__(self, stream):
        self._stream = stream
        self._files = {}

        # We assume that the stream is going to be accessed randomly. There
        # might be cases in which the access is sequential, in which case we
        # could avoid the seeks and operate on pipes and the like, but we'll
        # cross that bridge if/when we get there -- this format is simple.
        stream.seek(0)

        # It seems safest to figure out the bounds of the header by
        # searching for the text of the form `HeaderSize="0x000001b2"`.
        # TODO: reject header sizes that parse but are implausible.
        magic = b'HeaderSize="'
        probe = stream.read(256)
        idx = probe.find(magic)
        if idx < 0:
            raise ValueError('HeaderSize marker not found at the start of FileCabinet')

        # The size is always written as "0x" plus eight hex digits: 10 bytes.
        start = idx + len(magic)
        try:
            header_size = int(probe[start:start + 10], 16)
        except ValueError:
            raise ValueError('malformed HeaderSize value in FileCabinet')

        # Now we can read the full header and parse it.
        stream.seek(0)
        header_doc = etree.fromstring(stream.read(header_size))

        files_elem = header_doc.find('Files')
        if files_elem is None:
            raise Exception('no Files element in FileCabinet header')

        for record in files_elem.iterfind('File'):
            name = record.get('Name')
            rel_offset = record.get('Offset')
            size = record.get('Size')

            if name is None or rel_offset is None or size is None:
                raise Exception('incomplete File record in FileCabinet')

            try:
                rel_offset = int(rel_offset)
                size = int(size)
            except ValueError:
                raise Exception('malformed Offset or Size in File record in FileCabinet')

            # A negative size would make `read()` return the rest of the
            # stream, and a negative offset would point back into the header.
            if rel_offset < 0 or size < 0:
                raise Exception('implausible Offset or Size in File record in FileCabinet')

            # TODO: handle names with bad characters, offsets and sizes that
            # run past the end of the stream, etc.

            if name in self._files:
                raise Exception('duplicated File record "{}" in FileCabinet'.format(name))

            # Offsets in the header are relative to the end of the header.
            self._files[name] = ReaderFileInfo(name, header_size + rel_offset, size)

    def close(self):
        """Close the underlying stream, making this object essentially unusable.

        Calling this method more than once is harmless.

        """
        if self._stream is not None:
            self._stream.close()
            self._stream = None

    def filenames(self):
        """Return an iterable of the names of the files in this cabinet."""
        return self._files.keys()

    def read_file(self, filename):
        """Read the specified file into memory in its entirety and return its contents.

        Returns bytes. Raises :exc:`Exception` if the reader has been closed
        or if no file named *filename* is in the cabinet.

        """
        if self._stream is None:
            raise Exception('cannot read file "{}" with a closed FileCabinetReader'.format(filename))

        info = self._files.get(filename)
        if info is None:
            raise Exception('no such file "{}" in FileCabinet'.format(filename))

        self._stream.seek(info.offset)
        return self._stream.read(info.size)
# Entry for one file queued for writing: its name, its size in bytes, and its
# complete contents as a bytes object.
WriterFileInfo = namedtuple('WriterFileInfo', 'name size contents')


class FileCabinetWriter(object):
    """Writer for a simple container format for other files.

    Files are accumulated in memory with :meth:`add_file_with_data` and then
    written out, in the order they were added, by :meth:`emit`.

    One day, we should support the ability to stream data into a cabinet
    without having to grossly buffer everything in memory. But today is not
    that day.

    """

    _files = None

    def __init__(self):
        # Ordered so that the emitted offsets match the order of addition.
        self._files = OrderedDict()

    def add_file_with_data(self, name, data):
        """Add a file whose contents are stored in an in-memory buffer.

        The *data* argument should be a bytes object.

        Raises :exc:`ValueError` if *data* is not bytes or if a file named
        *name* has already been added.

        """
        if not isinstance(data, bytes):
            raise ValueError('the data argument must be an instance of the bytes type')

        if name in self._files:
            raise ValueError('a file named "{}" has already been added'.format(name))

        self._files[name] = WriterFileInfo(name, len(data), data)

    def filenames(self):
        """Return an iterable of the names of the files in this cabinet."""
        return self._files.keys()

    def emit(self, stream):
        """Write out the contents of this cabinet to the target stream.

        The *stream* argument must accept bytes in its ``write()`` method. The
        XML header is written first, followed by the contents of each file in
        the order in which the files were added.

        Raises :exc:`RuntimeError` if the header size cannot be patched into
        the serialized header, rather than silently writing a cabinet that
        cannot be read back.

        """
        # Create the header structure. The real header size is not known
        # until the header is serialized, so a same-width placeholder goes in
        # first and is patched afterwards.
        SIZE_PLACEHOLDER = '0xZYXWVUTS'

        cabinet = etree.Element('FileCabinet')
        cabinet.set('HeaderSize', SIZE_PLACEHOLDER)
        files = etree.SubElement(cabinet, 'Files')

        # File offsets are recorded relative to the end of the header.
        offset = 0

        for info in self._files.values():
            f = etree.SubElement(files, 'File')
            f.set('Name', info.name)
            f.set('Size', str(info.size))
            f.set('Offset', str(offset))
            offset += info.size

        # Serialize and patch in the actual header size. With a
        # non-pathological XML serialization, the HeaderSize item will occur
        # within the first SIZE_REGION bytes while the first filename will
        # occur beyond the first SIZE_REGION bytes, meaning that we'll be
        # resistant if someone tries to break us by using a filename that
        # includes SIZE_PLACEHOLDER.
        SIZE_REGION = 90

        header = stringify_xml_doc(cabinet, indent=True).encode('utf-8')
        placeholder = SIZE_PLACEHOLDER.encode('us-ascii')
        size_ascii = '0x{:08x}'.format(len(header)).encode('us-ascii')

        # The patch must not change the header's length, or the recorded size
        # would be wrong.
        if len(size_ascii) != len(placeholder):
            raise RuntimeError('FileCabinet header is too large for its HeaderSize field')

        size_region = header[:SIZE_REGION]

        if placeholder not in size_region:
            raise RuntimeError('could not locate the HeaderSize placeholder in the FileCabinet header')

        header = size_region.replace(placeholder, size_ascii, 1) + header[SIZE_REGION:]
        stream.write(header)

        # The rest is straightforward.
        for info in self._files.values():
            stream.write(info.contents)