/* * OpenXML (docx) to text conversion. * Steve Simon, Oct 2012 */ #include #include #include #include #include static void para(Biobuf *bp, Elem *ep); static void object(Biobuf *bp, Elem *ep) { char *v, *p; for(; ep; ep = ep->next) if(strcmp(ep->name, "o:OLEObject") == 0) if((v = xmlvalue(ep, "ProgID")) != nil){ /* trim cruft */ if((p = strrchr(v, '.')) != nil && atoi(p+1) != 0) *p = 0; Bprint(bp, "%s", v); } } static void run(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next){ if(strcmp(ep->name, "w:drawing") == 0) Bprint(bp, "[drawing]"); if(strcmp(ep->name, "w:pict") == 0) Bprint(bp, "[picture]"); if(strcmp(ep->name, "w:object") == 0 && ep->child){ Bprint(bp, "[object: "); object(bp, ep->child); Bprint(bp, "]"); } if(strcmp(ep->name, "w:t") == 0 && ep->pcdata) Bprint(bp, "%s", ep->pcdata); if(strcmp(ep->name, "w:tab") == 0) Bprint(bp, "\t"); } } static void cell(Biobuf *bp, Elem *ep) { int first; first = 1; for(; ep; ep = ep->next) if(strcmp(ep->name, "w:p") == 0 && ep->child){ if(! first) Bprint(bp, " "); first = 0; para(bp, ep->child); } } static void row(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next){ if(strcmp(ep->name, "w:tc") == 0){ Bprint(bp, "\t"); cell(bp, ep->child); } } Bprint(bp, "\n"); } static void table(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next) if(strcmp(ep->name, "w:tr") == 0 && ep->child) row(bp, ep->child); } static void paraprops(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next) if(strcmp(ep->name, "w:numPr") == 0 && ep->child) Bprint(bp, " • "); } static void para(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next){ if(strcmp(ep->name, "w:pPr") == 0 && ep->child) paraprops(bp, ep->child); if(strcmp(ep->name, "w:r") == 0 && ep->child) run(bp, ep->child); if(strcmp(ep->name, "w:hyperlink") == 0 && ep->child) para(bp, ep->child); /* autogenerated fields, e.g. auto numbered Figures */ if(strcmp(ep->name, "w:fldSimple") == 0 && ep->child) para(bp, ep->child); } } static void body(Biobuf *bp, Elem *ep) { for(; ep; ep = ep->next){ if(strcmp(ep->name, "w:p") == 0 && ep->child){ para(bp, ep->child); Bprint(bp, "\n\n"); } if(strcmp(ep->name, "w:tbl") == 0 && ep->child){ Bprint(bp, "\n"); table(bp, ep->child); } } } static void usage(void) { fprint(2, "usage: %s [-n] [file.xml]\n", argv0); exits("usage"); } void main(int argc, char *argv[]) { int fd; Xml *xp; Elem *ep; char *err; Biobuf bout; err = nil; ARGBEGIN{ case 'd': xmldebug++; break; default: usage(); }ARGEND; if(argc == 0){ if((xp = xmlparse(0, 8192, 0)) == nil) sysfatal("stdin: %r\n"); } else{ if((fd = open(argv[0], OREAD)) == -1) sysfatal("%s cannot open\n", argv[0]); if((xp = xmlparse(fd, 8192, 0)) == nil) sysfatal("%s: %r\n", argv[0]); close(fd); } Binit(&bout, 1, OWRITE); if((ep = xmllook(xp->root, "/w:document/w:body", nil, nil)) == nil || ep->child == nil) err = "bad xml format"; body(&bout, ep->child); Bterm(&bout); xmlfree(xp); exits(err); }