原创

[poi-tl]转换html内容到word

利用jsoup将html解析,递归解析,将不同的html标签内容转换成poi支持的内容。代码仅供参考。

完整代码示例请参见:
https://gitee.com/xuwangcheng/poi-tl-html-to-word

package yi.master.demo;

import cn.hutool.core.collection.CollUtil;

import cn.hutool.core.io.FileUtil;

import cn.hutool.core.util.ReUtil;

import com.deepoove.poi.NiceXWPFDocument;

import com.deepoove.poi.XWPFTemplate;

import com.deepoove.poi.config.Configure;

import com.deepoove.poi.data.MiniTableRenderData;

import com.deepoove.poi.data.PictureRenderData;

import com.deepoove.poi.data.TextRenderData;

import com.deepoove.poi.policy.AbstractRenderPolicy;

import com.deepoove.poi.policy.PictureRenderPolicy;

import com.deepoove.poi.policy.TextRenderPolicy;

import com.deepoove.poi.render.RenderContext;

import com.deepoove.poi.template.run.RunTemplate;

import com.deepoove.poi.util.TableTools;

import org.apache.commons.lang3.StringUtils;

import org.apache.poi.xwpf.usermodel.*;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.select.Elements;

import org.openxmlformats.schemas.wordprocessingml.x2006.main.STMerge;

import javax.imageio.ImageIO;

import java.awt.image.BufferedImage;

import java.io.File;

import java.io.IOException;

import java.io.InputStream;

import java.util.*;

/**

 * @author xuwangcheng

 * @version 1.0.0

 * @description

 * @date 2019/11/21 9:31

 */

public class Demo {

    public static void main(String[] args) throws IOException {

        String html = FileUtil.readString("demo.html", "utf-8");

        //配置

        Configure config = Configure.newBuilder().build();

        config.customPolicy("resultHtml", createHtmlRenderPolicy());

        //创建word模板对象

        Map<String, Object> map = new HashMap<String, Object>();

        map.put("top", "TOPPPPP");

        map.put("resultHtml", html);

        map.put("buttom", "buttommmmmmmmmmmmmm");

        XWPFTemplate template = XWPFTemplate.compile(getResourceInputStream("/out_template.docx"), config).render(map);

        template.writeToFile("D:\\demo.docx");

        template.close();

    }

    /**

 * 获取资源文件的文件流

 *

 * @return

 */

    public static InputStream getResourceInputStream(String filePath) {

        InputStream in = FileUtil.class.getResourceAsStream(filePath);

        if (in != null) {

            return in;

        }

        return null;

    }

    /**

 * 创建测试用例过程记录的渲染策略:解析对应的html并输出到word

 * @author xuwangcheng

 * @date 2019/7/26 10:10

 * @param

 * @return {@link AbstractRenderPolicy}

 */

    private static AbstractRenderPolicy createHtmlRenderPolicy() {

        return new AbstractRenderPolicy() {

            @Override

            protected void afterRender(RenderContext context) {

                // 清空模板标签所在段落

                clearPlaceholder(context, true);

            }

            @Override

            public void doRender(RunTemplate runTemplate, Object data, XWPFTemplate template) throws Exception {

                if (data == null || StringUtils.isBlank(data.toString())) {

                    return;

                }

                //获得Apache POI增强类NiceXWPFDocument

                NiceXWPFDocument doc = template.getXWPFDocument();

                String html = data.toString();

                html = html.replaceAll(">", ">")

                        .replaceAll("<", "<")

                        .replaceAll(" ", " ")

                        .replaceAll("\\n", "")

                        .replaceAll("", "\n");

                org.jsoup.nodes.Document htmlDoc = Jsoup.parse(html);

                Elements nodes = htmlDoc.body().children();

                XWPFParagraph xwpfParagraph = runTemplate.getRun().getParagraph();

                ListIterator<Element> itr = nodes.listIterator();

                while (itr.hasNext()) {

                    Element e = itr.next();

                    xwpfParagraph = parseHtmlToWord(e, doc, xwpfParagraph, true);

                }

            }

        };

    }

    /**

 * 转换整个html内容为word内容

 * @author xuwangcheng

 * @date 2019/7/29 18:46

 * @param ele ele

 * @param doc doc

 * @param xwpfParagraph xwpfParagraph

 * @return {@link XWPFParagraph}

 */

    private static XWPFParagraph parseHtmlToWord(Element ele, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph

            , boolean isParent) throws Exception {

        //处理img图片

        if ("img".equals(ele.tagName())) {

            parseImgToWord(ele.attr("src"), xwpfParagraph);

            return xwpfParagraph;

        }

        //处理table标签

        if ("table".equals(ele.tagName())) {

            xwpfParagraph = doc.insertNewParagraph(CollUtil.getLast(xwpfParagraph.getRuns()));

            parseTableToWord(doc, ele, xwpfParagraph.createRun());

            //有表格的话新建段落

            //xwpfParagraph = doc.createParagraph();

            return xwpfParagraph;

        }

        //处理标签 上标

        if ("sup".equalsIgnoreCase(ele.tagName())) {

            XWPFRun run = xwpfParagraph.createRun();

            run.setText(ele.text());

            // 设置字体加粗;

            run.setBold(true);

            // 设置字体大小;

            run.setFontSize(12);

            run.setFontFamily("Times New Roman", XWPFRun.FontCharRange.ascii);

            run.setFontFamily("宋体", XWPFRun.FontCharRange.eastAsia);

            run.setSubscript(VerticalAlign.SUPERSCRIPT);

            TextRenderPolicy.Helper.renderTextRun(run, new TextRenderData(ele.text()));

            return xwpfParagraph;

        }

        //处理其他文本标签

        String text = ele.ownText();

        boolean continueItr = true;

        //span标签默认全部为文字,不再继续迭代

        if ("span".equalsIgnoreCase(ele.tagName())) {

            text = ele.wholeText();

            continueItr = false;

        }

        boolean enabledBreak = (isParent || StringUtils.isNotBlank(ele.text()))

                && ReUtil.isMatch("(p|h[12345]|li|img)", ele.tagName());

        if (enabledBreak) {

            XWPFRun run = xwpfParagraph.createRun();

            run.addBreak();

        }

        if (StringUtils.isNotBlank(text)) {

            XWPFRun run = xwpfParagraph.createRun();

            TextRenderPolicy.Helper.renderTextRun(run, new TextRenderData(text));

        }

        if (continueItr && ele.children().size() > 0) {

            ListIterator<Element> itr = ele.children().listIterator();

            while (itr.hasNext()) {

                Element me = itr.next();

                xwpfParagraph = parseHtmlToWord(me, doc, xwpfParagraph, false);

            }

        }

        return xwpfParagraph;

    }

    /**

 * 转换图片为word内容

 * @author xuwangcheng

 * @date 2019/7/29 18:45

 * @param imgUrl imgUrl

 * @param xwpfParagraph xwpfParagraph

 */

    private static void parseImgToWord(String imgUrl, XWPFParagraph xwpfParagraph) throws Exception {

        //获取图片本地路径

       String imgRealPath = getImgRealPath(imgUrl);

       if (StringUtils.isBlank(imgRealPath) || !FileUtil.exist(imgRealPath)) {

           return;

       }

        //插入图片

        //获取图片对象

        BufferedImage img = ImageIO.read(new File(imgRealPath));

        //获得图片的宽

        int width = img.getWidth();

        //获得图片的高

        int height = img.getHeight();

        if (width > 600) {

            //获取比例

            int rate = (width / 600 ) + 1;

            width = width / rate - 20;

            height = height / rate;

        }

        PictureRenderData pictureRenderData = new PictureRenderData(width, height, imgRealPath);

        XWPFRun run = xwpfParagraph.createRun();

        PictureRenderPolicy.Helper.renderPicture(run, pictureRenderData);

    }

    /**

 * 通过imgUrl获取本地图片路径

 * @author xuwangcheng

 * @date 2019/11/21 9:47

 * @param imgUrl imgUrl

 * @return {@link String}

 */

    private static String getImgRealPath (String imgUrl) {

        //TODO 获取real_path

        return "G:\\1508490175_417949.jpg";

    }

    /**

 * 转换表格为word内容

 * @author xuwangcheng

 * @date 2019/7/29 18:45

 * @param doc doc

 * @param ele ele

 */

    private static void parseTableToWord(NiceXWPFDocument doc, Element ele, XWPFRun run) throws Exception {

        //简化表格html

        org.jsoup.nodes.Document tableDoc = Jsoup.parse(simplifyTable(ele.outerHtml()));

        Elements trList = tableDoc.getElementsByTag("tr");

        Elements tdList = trList.get(0).getElementsByTag("td");

        //创建表格

        XWPFTable xwpfTable = doc.insertNewTable(run, trList.size(), tdList.size());

        //设置样式

        TableTools.widthTable(xwpfTable, MiniTableRenderData.WIDTH_A4_FULL, tdList.size());

        TableTools.borderTable(xwpfTable, 4);

        //写入表格行和列内容

        Map<String, Boolean>[][] array = new Map[trList.size()][tdList.size()];

        for (int row = 0; row < trList.size(); row++) {

            Element trElement = trList.get(row);

            Elements tds = trElement.getElementsByTag("td");

            for (int col = 0; col < tds.size(); col++) {

                Element colElement = tds.get(col);

                String colspan = colElement.attr("colspan");

                String rowspan = colElement.attr("rowspan");

                String style = colElement.attr("style");

                StringBuilder styleSB = new StringBuilder();

                if (!StringUtils.isEmpty(colspan)) {

                    int colCount = Integer.parseInt(colspan);

                    for (int i = 0; i < colCount - 1; i++) {

                        array[row][col + i + 1] = new HashMap<String, Boolean>();

                        array[row][col + i + 1].put("mergeCol", true);

                    }

                }

                if (!StringUtils.isEmpty(rowspan)) {

                    int rowCount = Integer.parseInt(rowspan);

                    for (int i = 0; i < rowCount - 1; i++) {

                        array[row + i + 1][col] = new HashMap<String, Boolean>();

                        array[row + i + 1][col].put("mergeRow", true);

                    }

                }

                XWPFTableCell tableCell = xwpfTable.getRow(row).getCell(col);

                if (StringUtils.isEmpty(colspan)) {

                    if (col == 0) {

                        if (tableCell.getCTTc().getTcPr() == null) {

                            tableCell.getCTTc().addNewTcPr().addNewHMerge().setVal(STMerge.RESTART);

                        } else {

                            if (tableCell.getCTTc().getTcPr().getHMerge() == null) {

                                tableCell.getCTTc().getTcPr().addNewHMerge().setVal(STMerge.RESTART);

                            } else {

                                tableCell.getCTTc().getTcPr().getHMerge().setVal(STMerge.RESTART);

                            }

                        }

                    } else {

                        if (array[row][col] != null && array[row][col].get("mergeCol") != null && array[row][col].get("mergeCol")) {

                            if (tableCell.getCTTc().getTcPr() == null) {

                                tableCell.getCTTc().addNewTcPr().addNewHMerge().setVal(STMerge.CONTINUE);

                            } else {

                                if (tableCell.getCTTc().getTcPr().getHMerge() == null) {

                                    tableCell.getCTTc().getTcPr().addNewHMerge().setVal(STMerge.CONTINUE);

                                } else {

                                    tableCell.getCTTc().getTcPr().getHMerge().setVal(STMerge.CONTINUE);

                                }

                            }

                            continue;

                        } else {

                            if (tableCell.getCTTc().getTcPr() == null) {

                                tableCell.getCTTc().addNewTcPr().addNewHMerge().setVal(STMerge.RESTART);

                            } else {

                                if (tableCell.getCTTc().getTcPr().getHMerge() == null) {

                                    tableCell.getCTTc().getTcPr().addNewHMerge().setVal(STMerge.RESTART);

                                } else {

                                    tableCell.getCTTc().getTcPr().getHMerge().setVal(STMerge.RESTART);

                                }

                            }

                        }

                    }

                } else {

                    if (tableCell.getCTTc().getTcPr() == null) {

                        tableCell.getCTTc().addNewTcPr().addNewHMerge().setVal(STMerge.RESTART);

                    } else {

                        if (tableCell.getCTTc().getTcPr().getHMerge() == null) {

                            tableCell.getCTTc().getTcPr().addNewHMerge().setVal(STMerge.RESTART);

                        } else {

                            tableCell.getCTTc().getTcPr().getHMerge().setVal(STMerge.RESTART);

                        }

                    }

                }

                if (StringUtils.isEmpty(rowspan)) {

                    if (array[row][col] != null && array[row][col].get("mergeRow") != null && array[row][col].get("mergeRow")) {

                        if (tableCell.getCTTc().getTcPr() == null) {

                            tableCell.getCTTc().addNewTcPr().addNewVMerge().setVal(STMerge.CONTINUE);

                        } else {

                            if (tableCell.getCTTc().getTcPr().getVMerge() == null) {

                                tableCell.getCTTc().getTcPr().addNewVMerge().setVal(STMerge.CONTINUE);

                            } else {

                                tableCell.getCTTc().getTcPr().getVMerge().setVal(STMerge.CONTINUE);

                            }

                        }

                        continue;

                    } else {

                        if (tableCell.getCTTc().getTcPr() == null) {

                            tableCell.getCTTc().addNewTcPr().addNewVMerge().setVal(STMerge.RESTART);

                        } else {

                            if (tableCell.getCTTc().getTcPr().getVMerge() == null) {

                                tableCell.getCTTc().getTcPr().addNewVMerge().setVal(STMerge.RESTART);

                            } else {

                                tableCell.getCTTc().getTcPr().getVMerge().setVal(STMerge.RESTART);

                            }

                        }

                    }

                } else {

                    if (tableCell.getCTTc().getTcPr() == null) {

                        tableCell.getCTTc().addNewTcPr().addNewVMerge().setVal(STMerge.RESTART);

                    } else {

                        if (tableCell.getCTTc().getTcPr().getVMerge() == null) {

                            tableCell.getCTTc().getTcPr().addNewVMerge().setVal(STMerge.RESTART);

                        } else {

                            tableCell.getCTTc().getTcPr().getVMerge().setVal(STMerge.RESTART);

                        }

                    }

                }

                tableCell.removeParagraph(0);

                XWPFParagraph paragraph = tableCell.addParagraph();

                paragraph.setStyle(styleSB.toString());

                if (!StringUtils.isEmpty(style) && style.contains("text-align:center")) {

                    paragraph.setAlignment(ParagraphAlignment.CENTER);

                }

                parseHtmlToWord(colElement, doc, paragraph, true);

            }

        }

    }

    /**

 * 简化html中的表格dom

 * @author xuwangcheng

 * @date 2019/7/29 18:39

 * @param tableContent tableContent

 * @return {@link String}

 */

    private static String simplifyTable(String tableContent) {

        if (StringUtils.isEmpty(tableContent)) {

            return null;

        }

        org.jsoup.nodes.Document tableDoc = Jsoup.parse(tableContent);

        Elements trElements = tableDoc.getElementsByTag("tr");

        if (trElements != null) {

            Iterator<Element> eleIterator = trElements.iterator();

            Integer rowNum = 0;

            // 针对于colspan操作

            while (eleIterator.hasNext()) {

                rowNum++;

                Element trElement = eleIterator.next();

                //去除所有样式

                trElement.removeAttr("class");

                Elements tdElements = trElement.getElementsByTag("td");

                List<Element> tdEleList = covertElements2List(tdElements);

                for (int i = 0; i < tdEleList.size(); i++) {

                    Element curTdElement = tdEleList.get(i);

                    //去除所有样式

                    curTdElement.removeAttr("class");

                    Element ele = curTdElement.clone();

                    String colspanValStr = curTdElement.attr("colspan");

                    if (!StringUtils.isEmpty(colspanValStr)) {

                        ele.removeAttr("colspan");

                        Integer colspanVal = Integer.parseInt(colspanValStr);

                        for (int k = 0; k < colspanVal - 1; k++) {

                            curTdElement.after(ele.outerHtml());

                        }

                    }

                }

            }

            // 针对于rowspan操作

            List<Element> trEleList = covertElements2List(trElements);

            Element firstTrEle = trElements.first();

            Elements tdElements = firstTrEle.getElementsByTag("td");

            Integer tdCount = tdElements.size();

            //获取该列下所有单元格

            for (int i = 0; i < tdElements.size(); i++) {

                for (Element trElement : trEleList) {

                    List<Element> tdElementList = covertElements2List(trElement.getElementsByTag("td"));

                    try {

                        tdElementList.get(i);

                    } catch (Exception e) {

                        continue;

                    }

                    Node curTdNode = tdElementList.get(i);

                    Node cNode = curTdNode.clone();

                    String rowspanValStr = curTdNode.attr("rowspan");

                    if (!StringUtils.isEmpty(rowspanValStr)) {

                        cNode.removeAttr("rowspan");

                        Element nextTrElement = trElement.nextElementSibling();

                        Integer rowspanVal = Integer.parseInt(rowspanValStr);

                        for (int j = 0; j < rowspanVal - 1; j++) {

                            Node tempNode = cNode.clone();

                            List<Node> nodeList = new ArrayList<Node>();

                            nodeList.add(tempNode);

                            if (j > 0) {

                                nextTrElement = nextTrElement.nextElementSibling();

                            }

                            Integer indexNum = i + 1;

                            if (i == 0) {

                                indexNum = 0;

                            }

                            if (indexNum.equals(tdCount)) {

                                nextTrElement.appendChild(tempNode);

                            } else {

                                nextTrElement.insertChildren(indexNum, nodeList);

                            }

                        }

                    }

                }

            }

        }

        Element tableEle = tableDoc.getElementsByTag("table").first();

        String tableHtml = tableEle.outerHtml();

        return tableHtml;

    }

    /**

 * 转换Elements为list

 * @author xuwangcheng

 * @date 2019/7/29 18:40

 * @param curElements curElements

 * @return {@link List}

 */

    private static List<Element> covertElements2List(Elements curElements){

        List<Element> elementList = new ArrayList<Element>();

        Iterator<Element> eleIterator = curElements.iterator();

        while(eleIterator.hasNext()){

            Element curlement = eleIterator.next();

            elementList.add(curlement);

        }

        return elementList;

    }

}
正文到此结束