使用 Java 将 doc/docx 转换成 HTML
需要使用的 Maven 依赖(注意:下面的工具类还使用了 hutool 的 FileUtil,需要另外引入 cn.hutool 的 hutool-core 或 hutool-all 依赖):
<!--注意版本保持一致 poi poi-ooxml poi-scratchpad-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 操作doc ppt xls -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 操作docx pptx xlsx -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.2</version>
</dependency>
工具类:
package com.bxkc.utils;
import cn.hutool.core.io.FileUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
/**
 * Utility class for converting Word documents (binary {@code .doc} and OOXML {@code .docx})
 * to HTML text.
 *
 * <p>Embedded pictures are not exported by any method of this class; see the notes inside
 * {@link #convertDoc(InputStream)} and {@link #convertDocx(InputStream)} for the hooks that
 * would have to be configured to do so.
 *
 * @author sgz
 * @version 1.0.0
 * @since 2023/7/5
 */
public class WordToHtmlUtil {

    /**
     * Charset name used for serializing the HTML, for decoding the serialized bytes and for
     * writing the output files, so that all three always agree.
     */
    private static final String CHARSET_NAME = "utf-8";

    /**
     * Converts a binary Word ({@code .doc}) document to HTML.
     *
     * <p>The given stream is always closed by this method, whether the conversion succeeds
     * or fails.
     *
     * @param inputStream stream positioned at the start of a {@code .doc} file
     * @return the generated HTML markup
     * @throws TransformerException         if the DOM cannot be serialized to HTML
     * @throws IOException                  if the document cannot be read
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static String convertDoc(InputStream inputStream)
            throws TransformerException, IOException, ParserConfigurationException {
        // try-with-resources closes the document first and then the stream, also on failure.
        try (InputStream in = inputStream;
             HWPFDocument wordDocument = new HWPFDocument(in)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Pictures are not handled. To export them, a PicturesManager would have to be set
            // on the converter before processDocument is called, and the picture bytes saved
            // separately.
            wordToHtmlConverter.processDocument(wordDocument);
            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();

            // Serialize straight into the in-memory buffer; an extra BufferedOutputStream
            // would only add a flush that could be forgotten.
            ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, CHARSET_NAME);
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(byteOut));

            // Decode with the same charset the serializer was told to write, instead of the
            // platform default.
            return byteOut.toString(CHARSET_NAME);
        }
    }

    /**
     * Converts a binary Word ({@code .doc}) document to HTML and writes it to a file.
     *
     * @param inputStream stream positioned at the start of a {@code .doc} file; closed by
     *                    this method
     * @param outFile     file that receives the HTML, written as UTF-8
     * @throws TransformerException         if the DOM cannot be serialized to HTML
     * @throws IOException                  if the document cannot be read
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static void convertDoc(InputStream inputStream, File outFile)
            throws TransformerException, IOException, ParserConfigurationException {
        String htmlDoc = convertDoc(inputStream);
        FileUtil.writeString(htmlDoc, outFile, CHARSET_NAME);
    }

    /**
     * Converts an OOXML Word ({@code .docx}) document to XHTML.
     *
     * <p>The given stream is always closed by this method, whether the conversion succeeds
     * or fails.
     *
     * @param inputStream stream positioned at the start of a {@code .docx} file
     * @return the generated XHTML markup
     * @throws IOException if the document cannot be read or converted
     */
    public static String convertDocx(InputStream inputStream) throws IOException {
        try (InputStream in = inputStream;
             XWPFDocument document = new XWPFDocument(in)) {
            XHTMLOptions options = XHTMLOptions.create().indent(4);
            // Pictures are not exported. To export them, an image extractor and a URI resolver
            // pointing at an image folder would have to be configured on the options.

            // Convert into a character buffer so that no byte<->char decoding (and therefore
            // no charset guess) is involved in producing the returned String.
            StringWriter htmlOut = new StringWriter();
            XHTMLConverter.getInstance().convert(document, htmlOut, options);
            return htmlOut.toString();
        }
    }

    /**
     * Converts an OOXML Word ({@code .docx}) document to XHTML and writes it to a file.
     *
     * @param inputStream stream positioned at the start of a {@code .docx} file; closed by
     *                    this method
     * @param outFile     file that receives the XHTML, written as UTF-8
     * @throws IOException if the document cannot be read or converted
     */
    public static void convertDocx(InputStream inputStream, File outFile) throws IOException {
        String htmlDoc = convertDocx(inputStream);
        FileUtil.writeString(htmlDoc, outFile, CHARSET_NAME);
    }
}
亲测可以使用
评论区