Java word转为html 两种方式

滴滴滴上重点。。。

方式一：使用apache提供的工具包poi，poi使用的是4.1.2版本

缺点：对字体样式的处理不够精确；wmf公式图片的转换也不够精确。本文代码只支持doc格式

优点：转换速度相对很快，本地也方便调试

方式二：使用libreoffice，使用的是7.5版本

地址：下载 LibreOffice | LibreOffice 简体中文官方网站 - 自由免费的办公套件

Linux安装libreoffice案例：linux centos7工具安装之 libreOffice篇 libreOffice安装教程_centos7 安装libreoffice_the_bog的博客-CSDN博客

缺点：转换速度相对慢

优点：字体样式十分精确，支持doc、docx等格式。转换为pdf等其他格式的相关命令可自行搜索

废话不多说直接上代码！！！

方式一代码实现：

相关jar包地址：

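<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>

<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>batik-codec</artifactId>
    <version>1.7</version>
</dependency>

<dependency>
    <groupId>net.arnx</groupId>
    <artifactId>wmf2svg</artifactId>
    <version>0.9.5</version>
</dependency>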

package cn.hls.winner.winner_problem_manage.utils;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.apache.poi.util.IOUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Attributes;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.util.FileCopyUtils;

import org.springframework.web.multipart.MultipartFile;

import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import java.io.*;

import java.util.ArrayList;

import java.util.List;

import java.util.UUID;

/**

* @author lhz

* @description TODO

* @date 2023/9/18 10:14

public class Word2003Util {

private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class);

/**

* @param multipartFile 上传的文件

* @param htmlFile html上传路径

* @param htmlFileImgUrl html图片上传路径

* @param wordFileUrl word上传路径

* @return

public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) {

// 需要判断文件是否为doc,docx

if (multipartFile == null) {

return "word文档上传为空！";

}

if (multipartFile.getOriginalFilename().endsWith("docx")) {

return "word文档格式有误，请上传doc格式的！";

}

logger.info("***** word2003ToHtml start file:{}", multipartFile);

//返回服务器代理地址

String htmlUrl = "";

//随机命名html文件

String uuid = UUID.randomUUID().toString();

String htmlFileName = uuid + "." + "html";

logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);

try {

//上传服务器的图片本地地址

logger.info("==== htmlFile{参数} ====" + htmlFile);

//nginx转发后的图片地址

logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);

//生成网页的文件夹地址

String htmlFileUrl = htmlFile + uuid + "/";

logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);

//上传文件到服务器

boolean flag = upload(multipartFile, wordFileUrl, uuid);

if (!flag) {

return "word文档上传失败！";

}

logger.info("===== word文档上传成功！====");

//获取文件名称

String name = multipartFile.getOriginalFilename();

String suffix = name.substring(name.lastIndexOf("."));//.后缀名

String filePath = wordFileUrl + uuid + suffix;

logger.info("==== filePath ====" + filePath);

File file = new File(filePath);

// 1) 加载word文档生成 HWPFDocument对象

InputStream inputStream = new FileInputStream(file);

HWPFDocument wordDocument = new HWPFDocument(inputStream);

WordToHtmlConverter wordToHtmlConverter =

new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

//图片地址

String fileImg = htmlFileUrl + "images/";

File htmlFile1 = new File(htmlFileUrl);

if (!htmlFile1.exists()) {

//创建

if (htmlFile1.mkdirs()) {

logger.info("创建" + htmlFileUrl + "成功");

} else {

logger.info("创建" + htmlFileUrl + "成功");

}

//html代理地址

htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;

//html生成路径

htmlFileName = htmlFileUrl + htmlFileName;

logger.info("==== htmlFileName{ html ======== 输出地址} " + htmlFileName);

//设置图片存放的位置

String finalFileImg = fileImg;

final int[] index = {1};

//处理图片地址

wordToHtmlConverter.setPicturesManager(new PicturesManager() {

public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

File imgPath = new File(finalFileImg);

if (!imgPath.exists()) {//图片目录不存在则创建

imgPath.mkdirs();

}

String extension = pictureType.getExtension();

//随机生成图片名称

suggestedName = finalFileImg + "image" + index[0] + "." + extension;

File file = new File(suggestedName);

OutputStream os = null;

try {

os = new FileOutputStream(file);

os.write(content);

os.close();

//处理wmf公式图片

// if (extension.equals("wmf") || extension.equals("svg")) {

// if (extension.equals("wmf")) {

// String svgFile = suggestedName.substring(0,

// suggestedName.lastIndexOf(".wmf"))

// + ".svg";

// SvgToPngUtil.wmfToSvg(suggestedName, svgFile);

// }

// String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg";

String s = SvgToPngUtil.readToString(suggestedNameSVG);

String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png";

SvgToPngUtil.convertToPng(s, suggestedNamePng);

String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG);

// //删除无用图片

deleteFile(suggestedNameSVG, suggestedName);

// suggestedName = suggestedNameSVG;

// }

} catch (FileNotFoundException e) {

throw new RuntimeException(e);

} catch (IOException e) {

throw new RuntimeException(e);

}

//这里可以指定word文档中图片的路径。

String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl);

index[0]++;

return imgUlr;

}

});

wordToHtmlConverter.processDocument(wordDocument);

Document htmlDocument = wordToHtmlConverter.getDocument();

OutputStream outputStream = new FileOutputStream(htmlFileName);

DOMSource domSource = new DOMSource(htmlDocument);

StreamResult streamResult = new StreamResult(outputStream);

TransformerFactory factory = TransformerFactory.newInstance();

Transformer serializer = factory.newTransformer();

serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

serializer.setOutputProperty(OutputKeys.INDENT, "yes");

serializer.setOutputProperty(OutputKeys.METHOD, "html");

serializer.transform(domSource, streamResult);

outputStream.close();

logger.info("===== 网页样式转换开始 ====");

String htmlContents = readHtml(htmlFileName);

FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName));

logger.info("===== 网页样式转换完成 ====");

} catch (Exception e) {

logger.error("word2003ToHtml====异常");

logger.error(e.getMessage());

throw new RuntimeException(e);

}

return htmlUrl;

}

//获取网页内容

public static String readHtml(String htmlFileName) throws Exception {

StringBuilder htmlContents1 = new StringBuilder();

String htmlContents = "";

//读图网页内容

BufferedReader buf = new BufferedReader(

new InputStreamReader(new FileInputStream(htmlFileName), "utf-8"));

String c = "";

while ((c = buf.readLine()) != null) {

htmlContents1.append(c + "\n");

}

buf.close();

htmlContents = htmlContents1.toString();

htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical-align: middle;").replace("’","'").replace("’","'");

org.jsoup.nodes.Document document = Jsoup.parse(htmlContents);

formatHtml(document);

htmlContents = document.toString();

return htmlContents;

}

//网页字体样式

public static void formatHtml(org.jsoup.nodes.Document document) {

Elements elements = document.getAllElements();

String title = document.title();

logger.info("==== formatHtml ====title"+title);

for (Element element : elements) {

if ("main".equals(element.className())) {

continue;

}

if (title.contains("物理") || title.contains("数学") || title.contains("化学")) {

if (element.hasClass("s1")) {

element.attr("style", "font-family:Times New Roman;" + element.attr("style"));

}

String[] attrs = element.attr("style").split(";");

List attrList = new ArrayList();

for (String attr : attrs) {

if (attr.contains("font-family")) {

attrList.add(attr);

}

//将标签里的class属性b1 b2去掉

Elements bodys = element.getElementsByTag("body");

for(Element body : bodys){

System.out.println("=======className:" + body.className() + "==========");

if("b1 b2".equals(body.className())){

body.attr("class","");

}

public static void deleteFile(String... imgUrl) {

for (String s : imgUrl) {

File file = new File(s);

try {

if (file.isFile()) {

// 删除文件

if (file.delete()) {

logger.info("删除文件成功==== 名称为：" + file.getName());

} else {

}

} else {

}

} catch (Exception e) {

logger.error("====== 删除图片失败 ======" + e.getMessage());

throw new RuntimeException();

}

/**

* @param file 文件

* @param htmlFile 文件上传地址

* @param fileName 文件名称

* @return

public static boolean upload(MultipartFile file, String htmlFile, String fileName) {

InputStream is = null;

OutputStream os = null;

try {

File file1 = new File(htmlFile);

if (!file1.exists()) {

file1.mkdirs();

}

String name = file.getOriginalFilename();

String suffix = name.substring(name.lastIndexOf("."));//.后缀名

is = file.getInputStream();

os = new FileOutputStream(htmlFile + fileName + suffix);

//数据对拷

IOUtils.copy(is, os);

logger.info("==== 文件写入成功！====");

} catch (IOException e) {

logger.error("===== 文件上传失败 ====" + e.getMessage());

return false;

} finally {

if (null != is) {

try {

is.close();

} catch (IOException e) {

throw new RuntimeException(e);

}

if (null != os) {

try {

os.close();

} catch (IOException e) {

throw new RuntimeException(e);

}

return true;

}

方式二代码实现：

package com.hls.poi.service;

import com.hls.poi.controller.WordToHtmlController;

import org.apache.poi.util.IOUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.web.multipart.MultipartFile;

import java.io.*;

import java.util.UUID;

public class LibreOfficeCommandWordService {

private static final Logger logger = LoggerFactory.getLogger(WordToHtmlController.class);

/**

* /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/

* –convert-to pdf 后面的 /opt/a/1.docx 为原文件路径

* –outdir /opt/a/（转换后文件存放目录）

* soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm

//需要根据实际情况，查找LibreOffice安装的实际目录，

//Mac下是默认安装到/usr/local/bin，

//CentOS下默认安装在/usr/bin

private final static String sofficeDir = "/opt/libreoffice7.6/program/";

/**

* @param multipartFile 上传的文件

* @param htmlFile html上传路径

* @param htmlFileImgUrl html图片上传路径

* @param wordFileUrl word上传路径

* @param sofficeDir libreOffice安装地址

* @throws Exception

public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception {

try {

logger.info("exec command:[{}]\noutput: [{}]", "进入word2pdf{} 方法");

// 需要判断文件是否为doc,docx

if (multipartFile == null) {

return "word文档上传为空！";

}

//返回服务器代理地址

String htmlUrl = "";

//随机命名html文件

String uuid = UUID.randomUUID().toString();

String htmlFileName = uuid + "." + "html";

logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);

//上传服务器的图片本地地址

logger.info("==== htmlFile{参数} ====" + htmlFile);

//nginx转发后的图片地址

logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);

//生成网页的文件夹地址

String htmlFileUrl = htmlFile + uuid + "/";

logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);

//上传文件到服务器

boolean flag = upload(multipartFile, wordFileUrl, uuid);

if (!flag) {

return "word文档上传失败！";

}

logger.info("===== word文档上传成功！====");

//获取文件名称

String name = multipartFile.getOriginalFilename();

String suffix = name.substring(name.lastIndexOf("."));//.后缀名

//上传后word文档路径 /home/winnersoft/date/tomcat/html-root/office/word/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.doc

String inPath = wordFileUrl + uuid + suffix;

logger.info("==== inPath ====" + inPath);

if (!new File(inPath).exists()) {

return "word文档不存在！";

}

//图片地址

File htmlFile1 = new File(htmlFileUrl);

if (!htmlFile1.exists()) {

//创建

if (htmlFile1.mkdirs()) {

logger.info("创建" + htmlFileUrl + "成功");

} else {

logger.info("创建" + htmlFileUrl + "成功");

}

//html代理地址 //http://172.18.222.25:82/office/html/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.html

htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;

//html生成路径 /home/winnersoft/date/tomcat/html-root/office/html/af7ac82f-71bc-498c-8866-8bf7ef325345/

htmlFileName = htmlFileUrl;

logger.info("==== outPath{ html ======== 输出地址} " + htmlFileName);

//设置图片存放的位置

// String command = String.format("%s/soffice --convert-to pdf:writer_pdf_Export %s --outdir %s", sofficeDir, inPath, outPath);

String command = String.format("%s/soffice --headless --invisible --convert-to html:HTML %s --outdir %s", sofficeDir, inPath, htmlFileName);

logger.info("command==================================" + command);

String output = this.executeCommand(command);

logger.info("exec command:[{}]\noutput: [{}]", command, output);

return htmlUrl;

} catch (IOException e) {

logger.error("io异常"+e.getMessage());

throw new RuntimeException(e);

} catch (InterruptedException e) {

throw new RuntimeException(e);

}

protected String executeCommand(String command) throws IOException, InterruptedException {

logger.info("executeCommand{} 执行转化");

StringBuffer output = new StringBuffer();

Process p;

p = Runtime.getRuntime().exec(command);

p.waitFor();

try (

InputStreamReader inputStreamReader = new InputStreamReader(p.getInputStream(), "UTF-8");

BufferedReader reader = new BufferedReader(inputStreamReader)

) {

String line = "";

while ((line = reader.readLine()) != null) {

output.append(line + "\n");

}

// 销毁子进程

p.destroy();

return output.toString();

}

/**

* @param file 文件

* @param htmlFile 文件上传地址

* @param fileName 文件名称

* @return

public static boolean upload(MultipartFile file, String htmlFile, String fileName) {

InputStream is = null;

OutputStream os = null;

try {

File file1 = new File(htmlFile);

if (!file1.exists()) {

file1.mkdirs();

}

String name = file.getOriginalFilename();

String suffix = name.substring(name.lastIndexOf("."));//.后缀名

is = file.getInputStream();

os = new FileOutputStream(htmlFile + fileName + suffix);

//数据对拷

IOUtils.copy(is, os);

logger.info("==== 文件写入成功！====");

} catch (IOException e) {

logger.error("===== 文件上传失败 ====" + e.getMessage());

return false;

} finally {

if (null != is) {

try {

is.close();

} catch (IOException e) {

throw new RuntimeException(e);

}

if (null != os) {

try {

os.close();

} catch (IOException e) {

throw new RuntimeException(e);

}

return true;

}

文章链接

评论可见，请评论后查看内容，谢谢！！！

您阅读本篇文章共花了：

金钥匙

Java word转为html 两种方式

postgresql Oracle的学习心得和知识总结（十五）|Oracle数据库Real Application Testing之DBMS

前端学习 visual studio code HTML基础知识详解（上）（如果想知道html的全部基础知识点，那么只看这一篇就足够了！）

发表评论取消回复

金钥匙

Java word转为html 两种方式

postgresql Oracle的学习心得和知识总结（十五）|Oracle数据库Real Application Testing之DBMS

前端 学习 visual studio code HTML基础知识详解（上）（如果想知道html的全部基础知识点，那么只看这一篇就足够了！）

相关文章

发表评论取消回复

前端学习 visual studio code HTML基础知识详解（上）（如果想知道html的全部基础知识点，那么只看这一篇就足够了！）