Convert basic HTML tags to Google Doc styles using Apps Script

2017/02/15
Wednesday

I have a script which creates a Google Doc every day. In it I write thoughts from the day, as well as any dreams I had the previous night (in pursuit of lucid dreaming). When I began writing the script, the contents of the Google Doc were generated imperatively with code, according to a static format. As days rolled by, I noticed myself writing the same section headers over and over. Finally, I decided to automate that business.

I didn't want to reread Google's API docs every time I changed the template, so I figured I'd use HTML. Unfortunately, Google didn't have an HTML to Google Doc conversion function, so I wrote a little helper to transform HTML to Google Doc Text styles, at least for some basic tags. (Luckily, Google exposes XmlService to do the heavy lifting of parsing XML.)

// Maps each supported (lowercase) HTML tag name to the DocumentApp text
// attribute that the tag switches on. Several tags may share one attribute.
var TAG_STYLE_ATTRIBUTE_MAP = {
  // Presentational tags
  i: DocumentApp.Attribute.ITALIC,
  b: DocumentApp.Attribute.BOLD,
  u: DocumentApp.Attribute.UNDERLINE,
  s: DocumentApp.Attribute.STRIKETHROUGH,
  strike: DocumentApp.Attribute.STRIKETHROUGH, // obsolete alias of <s>

  // Semantic tags
  em: DocumentApp.Attribute.ITALIC,
  strong: DocumentApp.Attribute.BOLD,
  del: DocumentApp.Attribute.STRIKETHROUGH
};

// NOTE: Paragraph.appendText takes on the styles of the preceding Text,
// so every appended Text needs its attributes set explicitly.

// This object stores the empty, vanilla, tabula rasa state
// a Text object should have, so we can build from scratch:
// every attribute that a supported tag can switch on is explicitly off.
var DEFAULT_ATTRIBUTES = {};

(function _fillDefaultAttributes() {
  // Immediately-invoked function expression, so the loop variables
  // don't leak into the global namespace.
  // Object.keys only visits the map's own entries, so anything inherited
  // through the prototype chain never becomes a bogus attribute key.
  Object.keys(TAG_STYLE_ATTRIBUTE_MAP).forEach(function (tag) {
    DEFAULT_ATTRIBUTES[TAG_STYLE_ATTRIBUTE_MAP[tag]] = false;
  });
})();


/**
 * Parse basic HTML tags (like b, i, and u) to Google Docs format,
 * appending to the specified paragraph.
 *
 * The markup is wrapped in a synthetic <root> element and handed to
 * XmlService.parse, so it has to be well-formed XML (every tag closed).
 *
 * @param {Paragraph} p - paragraph that receives the styled text.
 * @param {string} html - markup to append; null, undefined and '' are no-ops.
 */
function appendHtmlToParagraph(p, html) {
  // Without this guard a missing value would be concatenated into the
  // literal text "null" / "undefined" and appended to the document.
  if (html == null || html === '') {
    return;
  }

  // &nbsp; is an HTML entity that XML does not predefine; swap it for the
  // equivalent numeric character reference before parsing.
  var xml = '<root>' + String(html).replace(/&nbsp;/g, '&#160;') + '</root>';
  var root = XmlService.parse(xml).getRootElement();
  _fillParagraphFromHtmlElements(p, root);
}

/**
 * Recursively walk the content of `root`, appending every text run to the
 * paragraph `p` with the styles implied by the tags that enclose it.
 *
 * @param {Paragraph} p - paragraph that receives the text via appendText.
 * @param {Element} root - XmlService element whose content is walked.
 * @param {Array} [styles] - attribute keys switched on by ancestor tags;
 *     omitted (or null) on the outermost call.
 */
function _fillParagraphFromHtmlElements(p, root, styles) {
  var inherited = styles == null ? [] : styles;

  var contents = root.getAllContent();
  for (var i = 0; i < contents.length; i++) {
    var node = contents[i];
    // Explicit string coercion of the content type, instead of relying on
    // loose equality between an enum value and a string.
    var nodeType = String(node.getType());

    if (nodeType === 'ELEMENT') {
      var element = node.asElement();
      // HTML tag names are case-insensitive; the map's keys are lowercase.
      var tagName = String(element.getName()).toLowerCase();

      // Unknown tags are transparent: their children keep the inherited
      // styles. The own-property check keeps tag names such as
      // "constructor" from resolving to members of Object.prototype.
      var childStyles = inherited;
      if (Object.prototype.hasOwnProperty.call(TAG_STYLE_ATTRIBUTE_MAP, tagName)) {
        // concat returns a new array, so sibling nodes never see this style.
        childStyles = inherited.concat(TAG_STYLE_ATTRIBUTE_MAP[tagName]);
      }
      _fillParagraphFromHtmlElements(p, element, childStyles);
    } else if (nodeType === 'TEXT' || nodeType === 'CDATA') {
      var text = p.appendText(node.getValue());

      // Start from the all-off defaults so the new run does not keep the
      // styles of the previously appended text, then switch on the
      // inherited ones. Copied with a plain loop so no ES2015 built-in
      // (Object.assign) is required.
      var attributes = {};
      for (var key in DEFAULT_ATTRIBUTES) {
        if (Object.prototype.hasOwnProperty.call(DEFAULT_ATTRIBUTES, key)) {
          attributes[key] = DEFAULT_ATTRIBUTES[key];
        }
      }
      for (var j = 0; j < inherited.length; j++) {
        attributes[inherited[j]] = true;
      }
      text.setAttributes(attributes);
    }
    // Every other content type (comments, processing instructions, ...)
    // is skipped.
  }
}

You'll need a Paragraph object to fill. AFAIK, Google doesn't let you instantiate Paragraph objects willy-nilly. You'll have to either create one with

// Append a new paragraph with empty text to the active document's body.
var p = DocumentApp.getActiveDocument().getBody().appendParagraph('');
// btw, when the hell will we get to use "let"? ;)

Or get one with something like:

// Reuse the first paragraph already present in the active document's body.
var p = DocumentApp.getActiveDocument().getBody().getParagraphs()[0];

Once you've got yourself a Paragraph, you can run:

appendHtmlToParagraph(p, '<i>your <u>html</u></i> <b>here</b>');



Well, I hope that helped save you a few seconds :P