In which I finally learn what’s in a TrueType font

LMNOP Last week someone posted an interestingly bizarre problem in the LilyPond newsgroup: using Times New Roman on Vista, the letter N becomes “Ị.” Go figure. Debugging that seemed like a fun puzzle, so I looked into it a bit, and concluded that there was a bug in the font. Someone who knows more than I do diagnosed it more completely: it turns out that the ‘post’ table assigns the name ‘N’ to three different characters, confusing LilyPond (or pango, or freetype, or whatever). Microsoft already knows about it, but has no plans to do anything, presumably because Microsoft software doesn’t use the post table, and Microsoft doesn’t care about any stinkin’ software other than their own.

For reasons that escape me, this was enough to inspire me to learn what’s inside a TrueType font. The format is, not surprisingly, both simple and Byzantine. I’ve cobbled together a Python program to fix the problem with MS’s TNR. In case anyone is curious, it’s below the fold. For heaven’s sake don’t assume it won’t ruin anything you run through it.

from __future__ import with_statement
from __future__ import division

import sys
import struct
import copy
import StringIO

from contextlib import contextmanager
from optparse import OptionParser

# Built-in glyph names, listed in index order (0 through 257).  A 'post'
# table name index below 258 refers to one of these names; larger indices
# refer to names stored in the table itself.
_MAC_GLYPH_NAMES = (
    # 0-2
    ".notdef", ".null", "nonmarkingreturn",
    # 3-18
    "space", "exclam", "quotedbl", "numbersign", "dollar", "percent",
    "ampersand", "quotesingle", "parenleft", "parenright", "asterisk",
    "plus", "comma", "hyphen", "period", "slash",
    # 19-28
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine",
    # 29-35
    "colon", "semicolon", "less", "equal", "greater", "question", "at",
    # 36-61
    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
    "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
    # 62-67
    "bracketleft", "backslash", "bracketright", "asciicircum",
    "underscore", "grave",
    # 68-93
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    # 94-97
    "braceleft", "bar", "braceright", "asciitilde",
    # 98-104
    "Adieresis", "Aring", "Ccedilla", "Eacute", "Ntilde", "Odieresis",
    "Udieresis",
    # 105-111
    "aacute", "agrave", "acircumflex", "adieresis", "atilde", "aring",
    "ccedilla",
    # 112-119
    "eacute", "egrave", "ecircumflex", "edieresis",
    "iacute", "igrave", "icircumflex", "idieresis",
    # 120-129
    "ntilde",
    "oacute", "ograve", "ocircumflex", "odieresis", "otilde",
    "uacute", "ugrave", "ucircumflex", "udieresis",
    # 130-143
    "dagger", "degree", "cent", "sterling", "section", "bullet",
    "paragraph", "germandbls", "registered", "copyright", "trademark",
    "acute", "dieresis", "notequal",
    # 144-159
    "AE", "Oslash", "infinity", "plusminus", "lessequal", "greaterequal",
    "yen", "mu", "partialdiff", "summation", "product", "pi", "integral",
    "ordfeminine", "ordmasculine", "Omega",
    # 160-172
    "ae", "oslash", "questiondown", "exclamdown", "logicalnot", "radical",
    "florin", "approxequal", "Delta", "guillemotleft", "guillemotright",
    "ellipsis", "nonbreakingspace",
    # 173-191
    "Agrave", "Atilde", "Otilde", "OE", "oe", "endash", "emdash",
    "quotedblleft", "quotedblright", "quoteleft", "quoteright",
    "divide", "lozenge", "ydieresis", "Ydieresis", "fraction", "currency",
    "guilsinglleft", "guilsinglright",
    # 192-198
    "fi", "fl", "daggerdbl", "periodcentered", "quotesinglbase",
    "quotedblbase", "perthousand",
    # 199-214
    "Acircumflex", "Ecircumflex", "Aacute", "Edieresis", "Egrave",
    "Iacute", "Icircumflex", "Idieresis", "Igrave",
    "Oacute", "Ocircumflex", "apple", "Ograve",
    "Uacute", "Ucircumflex", "Ugrave",
    # 215-225
    "dotlessi", "circumflex", "tilde", "macron", "breve", "dotaccent",
    "ring", "cedilla", "hungarumlaut", "ogonek", "caron",
    # 226-238
    "Lslash", "lslash", "Scaron", "scaron", "Zcaron", "zcaron",
    "brokenbar", "Eth", "eth", "Yacute", "yacute", "Thorn", "thorn",
    # 239-247
    "minus", "multiply", "onesuperior", "twosuperior", "threesuperior",
    "onehalf", "onequarter", "threequarters", "franc",
    # 248-257
    "Gbreve", "gbreve", "Idotaccent", "Scedilla", "scedilla",
    "Cacute", "cacute", "Ccaron", "ccaron", "dcroat",
)

# index -> name mapping, and the set of all built-in names
macnamedict = dict(enumerate(_MAC_GLYPH_NAMES))
macnameset = set(_MAC_GLYPH_NAMES)

class TTFHeader(object):
    """The fields found in the header at the start of a ttf file."""

    # 16-bit fields that follow the version number, in the order they are read
    _USHORT_FIELDS = ("ntables", "searchRange", "entrySelector", "rangeShift")

    def __init__(self, reader):
        """Fill in the header fields by reading them, in order, from reader."""
        self.sfntver = reader.readFixed()
        for field in self._USHORT_FIELDS:
            setattr(self, field, reader.readUShort())

class Table(object):
    """One table header: a tag followed by checksum, offset and length."""

    def __init__(self, reader):
        """Read the tag and then the three unsigned longs from reader."""
        self.tag = reader.readTag()
        self.checksum, self.offset, self.length = [
            reader.readULong() for _ in range(3)]

class Reader(object):
    """Reads big-endian font data types from a binary input stream.

    The stream must provide read() and tell(); seek() is only needed when
    Reader.seek() is used.
    """

    def __init__(self, istrm):
        """Wrap istrm; the stream itself keeps track of the position."""
        self.istrm_ = istrm

    def pos(self):
        """Return the current position in the stream."""
        return self.istrm_.tell()

    def seek(self, p):
        """Move the stream to absolute position p."""
        return self.istrm_.seek(p)

    def pad(self, pad = 4):
        """Skip forward to the next multiple of pad bytes, if not on one."""
        p = self.pos() % pad
        if p:
            self.istrm_.read(pad - p)

    def readBytes(self, n):
        """Read and return n raw bytes."""
        return self.istrm_.read(n)

    def readSomething(self, fmt):
        """Read one value described by the struct format fmt.

        Raises struct.error if the stream does not hold enough bytes.
        """
        sz = struct.calcsize(fmt)
        raw = self.istrm_.read(sz)
        (ret,) = struct.unpack(fmt, raw)
        return ret

    def readUShort(self):
        """Read an unsigned 16-bit integer."""
        return self.readSomething(">H")

    def readShort(self):
        """Read a signed 16-bit integer."""
        return self.readSomething(">h")

    def readULong(self):
        """Read an unsigned 32-bit integer."""
        return self.readSomething(">L")

    def readLong(self):
        """Read a signed 32-bit integer."""
        return self.readSomething(">l")

    def readTag(self):
        """Read a four-byte tag and return it as raw bytes."""
        return self.istrm_.read(4)

    def readFixed(self):
        """Read a signed 16.16 fixed-point number and return it as a float."""
        i0 = self.readSomething(">l")
        # Divide by a float so the result keeps its fractional part even
        # where "/" means integer division.
        return i0 / 65536.0

    def readFWORD(self):
        """Read an FWORD, which is stored as a signed 16-bit integer."""
        return self.readShort()

    def readPascalString(self):
        """Read a string stored as one length byte followed by its bytes."""
        l = ord(self.istrm_.read(1))
        return self.istrm_.read(l)

class Writer(object):
    """Writes big-endian font data types to a binary output stream.

    The stream must provide write() and tell().  Any attribute not defined
    here is looked up on the wrapped stream, so stream-specific methods
    such as getvalue() or checksum() can be called through the Writer.
    """

    def __init__(self, ostrm):
        """Wrap ostrm; the stream itself keeps track of the position."""
        self.ostrm_ = ostrm

    def pos(self):
        """Return the current position in the stream."""
        return self.ostrm_.tell()

    def seek(self, p):
        """Move the stream to absolute position p."""
        return self.ostrm_.seek(p)

    def pad(self, pad = 4):
        """Write zero bytes up to the next multiple of pad, if not on one."""
        p = self.pos() % pad
        if p:
            # "Nx" packs N zero bytes; this gives a byte string on every
            # Python version.
            self.ostrm_.write(struct.pack("%dx" % (pad - p)))

    def writeBytes(self, b):
        """Write the raw bytes b."""
        return self.ostrm_.write(b)

    def writeSomething(self, fmt, d):
        """Write the single value d using the struct format fmt."""
        b = struct.pack(fmt, d)
        return self.ostrm_.write(b)

    def writeUShort(self, d):
        """Write an unsigned 16-bit integer."""
        return self.writeSomething(">H", d)

    def writeShort(self, d):
        """Write a signed 16-bit integer."""
        return self.writeSomething(">h", d)

    def writeULong(self, d):
        """Write an unsigned 32-bit integer."""
        return self.writeSomething(">L", d)

    def writeLong(self, d):
        """Write a signed 32-bit integer."""
        return self.writeSomething(">l", d)

    def writeTag(self, d):
        """Write a tag exactly as given."""
        return self.ostrm_.write(d)

    def writeFixed(self, d):
        """Write d as a signed 16.16 fixed-point number."""
        # struct integer formats need an int, while d is normally a float.
        return self.writeSomething(">l", int(round(d * 0x10000)))

    def writeFWORD(self, d):
        """Write an FWORD, which is stored as a signed 16-bit integer."""
        return self.writeShort(d)

    def writePascalString(self, d):
        """Write d as one length byte followed by the bytes of d.

        Raises ValueError if d is longer than 255 bytes.
        """
        sz = len(d)
        if sz > 255:
            raise ValueError("string too long for a Pascal string: %d" % sz)
        self.ostrm_.write(struct.pack("B", sz))
        self.ostrm_.write(d)

    # forward anything else to the ostrm
    def __getattr__(self, name):
        # Without this guard a Writer whose ostrm_ was never set would
        # recurse forever while looking up ostrm_ itself.
        if name == "ostrm_":
            raise AttributeError(name)
        return getattr(self.ostrm_, name)

class ChecksumOstrm(object):
    """Write-only fake stream that sums what is written as 32-bit words.

    Each byte is weighted by its place inside a big-endian four-byte word,
    the place being determined by the total number of bytes written so far.
    """

    def __init__(self):
        self.sum = 0
        self.count = 0

    def tell(self):
        """Return the number of bytes written so far."""
        return self.count

    def write(self, b):
        """Add every byte of b to the running sum."""
        position = self.count
        for ch in b:
            # 24, 16, 8 or 0 depending on the byte's place within its word
            shift = 8 * (3 - position % 4)
            self.sum += ord(ch) << shift
            position += 1
        self.count += len(b)

    def checksum(self):
        """Return the running sum reduced to 32 bits."""
        return self.sum % 0x100000000

class NullOstrm(object):
    """Write-only fake stream that discards data and only counts its size."""

    def __init__(self):
        self.count = 0

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.count

    def write(self, b):
        """Discard b, remembering only how long it was."""
        written = len(b)
        self.count = self.count + written

def getName(glyphNameIndex, name, i):
    """Return the name of glyph i.

    glyphNameIndex maps glyph numbers to name indices.  An index below 258
    selects a built-in name from macnamedict; any other index selects
    name[index - 258], name being the list of extra names.
    """
    idx = glyphNameIndex[i]
    if idx < 258:
        return macnamedict[idx]
    else:
        return name[idx-258]

def processPostTable(t, fix = False):
    """Report repeated glyph names in a 'post' table and optionally rename them.

    t is a table object whose raw data is in t.bytes.  Only version 2.0
    tables are examined; for any other version nothing is done.

    Every glyph whose name index was already used by an earlier glyph is
    reported as "repeated name".  Extra names that duplicate one of the
    built-in names in macnameset are reported as "bad name" but are never
    changed.

    When fix is true and repeats were found, each repeated glyph is given
    the new name 'glyph<N>' (N being the glyph number), and t.bytes,
    t.length and t.checksum are replaced to match the rebuilt table.
    """
    reader = Reader(StringIO.StringIO(t.bytes))
    ver = reader.readFixed()
    italicAngle = reader.readFixed()
    underlinePos = reader.readFWORD()
    underlineThickness = reader.readFWORD()
    fixedPitch = reader.readULong()
    # some memory stuff: four ULongs that are written back unchanged
    mstuff = [reader.readULong() for i in range(0, 4)]

    if ver != 2.0:
        return

    nglyphs = reader.readUShort()
    glyphNameIndex = [reader.readUShort() for i in range(0, nglyphs)]
    # Indices from 258 up refer to the extra names stored after the index
    # array, so the largest index tells how many names to read.  max() is
    # given the list itself: unpacking it with * fails for a single glyph.
    if glyphNameIndex:
        nnew = max(glyphNameIndex) - 257
    else:
        nnew = 0
    names = [reader.readPascalString() for i in range(0, nnew)]

    # glyph numbers whose name index was already taken by an earlier glyph
    indexset = set()
    badNames = []
    for i, idx in enumerate(glyphNameIndex):
        if idx in indexset:
            if idx < 258:
                n = macnamedict[idx]
            else:
                n = names[idx-258]
            print("repeated name %s %s" % (n, i))
            badNames.append(i)
        else:
            indexset.add(idx)

    # Extra names that collide with built-in names are only reported;
    # fixing this seems to be a bit problematic...
    for n in names:
        if n in macnameset:
            print("bad name %s" % (n,))

    if not (fix and badNames):
        return

    for i in badNames:
        # the new name goes at the end of the extra-name list
        glyphNameIndex[i] = len(names) + 258
        names.append('glyph%d' % i)

    # now write the new bytes
    writer = Writer(StringIO.StringIO())
    writer.writeFixed(ver)
    writer.writeFixed(italicAngle)
    writer.writeFWORD(underlinePos)
    writer.writeFWORD(underlineThickness)
    writer.writeULong(fixedPitch)
    for m in mstuff:
        writer.writeULong(m)

    writer.writeUShort(nglyphs)
    for idx in glyphNameIndex:
        writer.writeUShort(idx)
    for name in names:
        writer.writePascalString(name)

    # now replace the bytes and recalc the checksum
    t.bytes = writer.getvalue()
    t.length = len(t.bytes)
    t.checksum = calcChecksum(t.bytes)

class Cmap4(object):
    """cmap subtable format 4 translator."""

    def __init__(self, reader):
        """Parse a format 4 subtable starting at the reader's position."""
        pos = reader.pos()
        self.format = reader.readUShort()
        self.length = reader.readUShort()
        self.language = reader.readUShort()
        self.segCountX2 = reader.readUShort()
        self.searchRange = reader.readUShort()
        self.entrySelector = reader.readUShort()
        self.rangeShift = reader.readUShort()

        segCount = self.segCountX2 // 2
        self.endCount = [reader.readUShort() for i in range(0, segCount)]
        # pad
        reader.readUShort()
        self.startCount = [reader.readUShort() for i in range(0, segCount)]
        self.idDelta = [reader.readShort() for i in range(0, segCount)]
        # this includes both range offset and glyph array
        # because of the rather baroque algorithm: a range offset is
        # relative to its own slot and points on into the glyph array
        sofar = reader.pos() - pos
        sz = (self.length - sofar) // 2
        self.idRangeOffset = [reader.readUShort() for i in range(0, sz)]

    def lookup(self, cp):
        """Return the glyph index for code point cp, or 0 if it has none."""
        # find the first segment whose end is not below cp
        for i, e in enumerate(self.endCount):
            if cp <= e:
                break
        else:
            return 0
        if cp < self.startCount[i]:
            # cp falls in the gap before segment i
            return 0
        offset = self.idRangeOffset[i]
        if offset == 0:
            return (cp + self.idDelta[i]) & 0xffff
        # offset is in bytes, counted from slot i of idRangeOffset
        slot = i + offset // 2 + (cp - self.startCount[i])
        index = self.idRangeOffset[slot]
        if index:
            return (index + self.idDelta[i]) & 0xffff
        return 0

def processCmapTable(t):
    """Walk the subtables of a 'cmap' table and parse the format 4 ones.

    t is a table object whose raw data is in t.bytes.  Each distinct
    subtable offset is visited once, in offset order.  Nothing is returned
    and t is not modified; the parsed subtables are discarded.
    """
    reader = Reader(StringIO.StringIO(t.bytes))
    reader.readUShort()  # table version, not used
    nsub = reader.readUShort()

    encrecs = []
    offsets = set()
    # read encoding records
    for i in range(0, nsub):
        platformId = reader.readUShort()
        encodingId = reader.readUShort()
        offset = reader.readULong()
        # several records may share one subtable; keep the first of each
        if offset not in offsets:
            offsets.add(offset)
            encrecs.append((platformId, encodingId, offset))

    # order by offset (tuple-unpacking lambdas are not valid on Python 3)
    encrecs.sort(key = lambda rec: rec[2])

    for platId, encId, offset in encrecs:
        reader.seek(offset)
        if reader.pos() != offset:
            print("cmap subtable offset mismatch")
        subFormat = reader.readUShort()
        if subFormat == 4:
            # Cmap4 reads the subtable from its first byte
            reader.seek(offset)
            Cmap4(reader)

def calcChecksum(bytes):
    """Return the 32-bit checksum of the given byte string."""
    summer = ChecksumOstrm()
    Writer(summer).writeBytes(bytes)
    return summer.checksum()

def calcFileChecksum(h):
    """Return the checksum of the whole font described by h.

    The font is written to a checksum stream with the checksum adjustment
    in the 'head' table temporarily set to zero.  The stored adjustment is
    put back afterwards, even if writing the font raises.
    """
    writer = Writer(ChecksumOstrm())
    csa = getHeadChecksumAdj(h)
    setHeadChecksumAdj(h, 0)
    try:
        writeTTFFile(h, writer)
    finally:
        # never leave the head table with a zeroed adjustment
        setHeadChecksumAdj(h, csa)
    return writer.checksum()

def getHeadChecksumAdj(ttfh):
    """Return the unsigned 32-bit value at byte offset 8 of the 'head' table.

    ttfh.tabledict maps table tags to table objects whose raw data is in
    their bytes attribute.
    """
    t = ttfh.tabledict['head']
    (csa,) = struct.unpack_from('>L', t.bytes, 8)
    return csa

def setHeadChecksumAdj(ttfh, csa):
    """Overwrite bytes 8 to 11 of the 'head' table with csa.

    csa is stored as a big-endian unsigned 32-bit value; the rest of the
    table data is kept as it is.
    """
    head = ttfh.tabledict['head']
    packed = struct.pack('>L', csa)
    before = head.bytes[0: 8]
    after = head.bytes[12:]
    head.bytes = before + packed + after

def writeTableHeader(t, writer):
    """Write the header of table t: tag, checksum, offset and length."""
    writer.writeTag(t.tag)
    for field in ("checksum", "offset", "length"):
        writer.writeULong(getattr(t, field))

def writeTTFHeader(h, writer):
    """Write the file header h: the version, then four unsigned shorts."""
    writer.writeFixed(h.sfntver)
    for field in ("ntables", "searchRange", "entrySelector", "rangeShift"):
        writer.writeUShort(getattr(h, field))

def writeTTFFile(h, writer):
    """Write the complete font h to writer.

    The file header comes first, then one table header per entry of
    h.tables, then the data of every table in h.tablesInOrder, with the
    writer padded before each table's data.
    """
    writeTTFHeader(h, writer)
    for table in h.tables:
        writeTableHeader(table, writer)
    # now the bytes
    for table in h.tablesInOrder:
        writer.pad()
        data = table.bytes
        writer.writeBytes(data)

def updateTableOffsets(h):
    """Recompute the offset of every table in h.

    The font is written to a stream that only counts bytes, and each table
    in h.tablesInOrder gets the position at which its data would start.
    """
    counter = Writer(NullOstrm())
    writeTTFHeader(h, counter)
    for table in h.tables:
        writeTableHeader(table, counter)
    # now the bytes
    for table in h.tablesInOrder:
        counter.pad()
        table.offset = counter.pos()
        counter.writeBytes(table.bytes)

def processFontFile(instrm, fixRepeats = False, ostrm = None):
    """Read a font from instrm, check its 'post' and 'cmap' tables, and
    optionally write a fixed copy.

    instrm is a binary input stream positioned at the start of the font.
    When fixRepeats is true, repeated glyph names in the 'post' table are
    renamed, the table offsets and the checksum adjustment in the 'head'
    table are recomputed, and the font is written to ostrm if one is
    given.  Without fixRepeats the font is only inspected.

    Raises KeyError if the font has no 'post' or 'cmap' table, or, when
    fixing, no 'head' table.
    """
    reader = Reader(instrm)
    # read the header
    ttfh = TTFHeader(reader)
    if ttfh.rangeShift != 16 * ttfh.ntables - ttfh.searchRange:
        print("bogus header info")

    # read the table headers
    ttfh.tables = []
    ttfh.tabledict = {}
    for idx in range(0, ttfh.ntables):
        t = Table(reader)
        ttfh.tabledict[t.tag] = t
        ttfh.tables.append(t)

    # Read the table data in file order.  Per-table checksums are not
    # verified here.
    ttfh.tablesInOrder = sorted(ttfh.tables, key = lambda x: x.offset)
    for t in ttfh.tablesInOrder:
        reader.pad()
        # Tables normally follow one another with only padding between
        # them; when one does not, go to where its header says it is.
        if reader.pos() != t.offset:
            reader.seek(t.offset)
        t.bytes = reader.readBytes(t.length)

    processPostTable(ttfh.tabledict['post'], fixRepeats)
    processCmapTable(ttfh.tabledict['cmap'])

    if fixRepeats:
        # fix the offsets
        updateTableOffsets(ttfh)
        # calculate new checksum, and write a file
        csacalc = (0xB1B0AFBA - calcFileChecksum(ttfh)) & 0xffffffff
        setHeadChecksumAdj(ttfh, csacalc)
        if ostrm:
            writeTTFFile(ttfh, Writer(ostrm))

def main(args):
    """Command-line entry point.

    args is the full argument list including the program name, as in
    sys.argv; None means use sys.argv itself.  Options:

      -i/--in FILE    font file to read
      -o/--out FILE   file to write the fixed font to (only with --fix)
      -f/--fix        fix repeated names and write a new file
    """
    print("Use this at your own risk!")
    parser = OptionParser()
    parser.add_option("-i", "--in", dest="infile",
                      help="read data from FILE", metavar="FILE")
    parser.add_option("-o", "--out", dest="outfile",
                      help="write data to FILE", metavar="FILE")
    parser.add_option("-f", "--fix", dest="fix",
                      action="store_true", default=False,
                      help="Fix repeats and bad names and output a new file")

    if args is None:
        args = sys.argv
    options, args = parser.parse_args(list(args[1:]))

    if not options.infile:
        print("No file specified")
        return

    with open(options.infile, "rb") as instrm:
        # Nothing is written unless --fix is given, so only open the
        # output then; opening it would truncate an existing file.
        ostrm = None
        if options.fix and options.outfile:
            ostrm = open(options.outfile, "wb")
        try:
            processFontFile(instrm, options.fix, ostrm)
        finally:
            if ostrm is not None:
                ostrm.close()

if __name__ == "__main__":
    main(sys.argv)

Advertisements

Tags: , ,

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s


%d bloggers like this: