Golang实现PDF转Docx的实用方法

大家好，你们好吗？有没有什么库可以实现将PDF文件转换为.doc或.docx格式？我快速谷歌搜索找到的那些库，功能都是将不同格式转换为txt，或者从HTML转换为PDF，但没有从doc转换为PDF或者反向转换的。你们有什么推荐吗？

此致，

Henrique

wuwangju 1楼

更多关于Golang实现PDF转Docx的实用方法的实战系列教程也可以访问 https://www.itying.com/category-94-b0.html

对于PDF转Docx的需求，目前确实没有完美的原生Go库可以直接实现。不过可以通过以下几种方式实现：

方法1：使用外部工具（推荐）

通过Go调用成熟的命令行工具来实现转换：

package main

import (
    "fmt"
    "os/exec"
    "log"
)

// convertPDFtoDocx converts the PDF at pdfPath to DOCX by running the
// "libreoffice" executable in headless mode.
//
// outputPath is passed to LibreOffice's --outdir option, so it names the
// directory that receives the converted file, not the .docx file itself.
//
// On failure the returned error wraps the underlying exec error (so callers
// can inspect it with errors.Is / errors.As) and carries the combined
// stdout/stderr of the LibreOffice process.
func convertPDFtoDocx(pdfPath, outputPath string) error {
    cmd := exec.Command("libreoffice",
        "--headless",
        // PDFs are opened by Draw by default, which has no DOCX export
        // filter; force the Writer PDF import filter so the export can work.
        "--infilter=writer_pdf_import",
        "--convert-to", "docx",
        "--outdir", outputPath,
        pdfPath)

    output, err := cmd.CombinedOutput()
    if err != nil {
        return fmt.Errorf("转换失败: %w\n输出: %s", err, output)
    }

    return nil
}

// convertPDFtoDocxWithPandoc converts a PDF to DOCX by piping the text
// produced by "pdftotext" into "pandoc". Both tools must be installed and on
// PATH. Only the extracted text is carried over.
//
// pandoc has no "plain" input format, so the text is read as markdown; text
// that happens to contain markdown syntax will be interpreted as such.
//
// If pdftotext fails, pandoc may already have created docxPath from partial
// or empty input; the file is not removed here.
func convertPDFtoDocxWithPandoc(pdfPath, docxPath string) error {
    // "-" makes pdftotext write the extracted text to stdout.
    txtCmd := exec.Command("pdftotext", pdfPath, "-")
    docxCmd := exec.Command("pandoc", "-f", "markdown", "-t", "docx", "-o", docxPath)

    // Connect pdftotext's stdout to pandoc's stdin.
    pipe, err := txtCmd.StdoutPipe()
    if err != nil {
        return fmt.Errorf("creating pdftotext stdout pipe: %w", err)
    }
    docxCmd.Stdin = pipe

    // Start the consumer first so the producer never blocks on a full pipe.
    if err := docxCmd.Start(); err != nil {
        pipe.Close()
        return fmt.Errorf("starting pandoc: %w", err)
    }

    if err := txtCmd.Run(); err != nil {
        // Reap pandoc so it is not left behind as a zombie; its own result is
        // irrelevant because the pdftotext failure is what gets reported.
        _ = docxCmd.Wait()
        return fmt.Errorf("running pdftotext: %w", err)
    }

    if err := docxCmd.Wait(); err != nil {
        return fmt.Errorf("running pandoc: %w", err)
    }
    return nil
}

方法2：使用Go绑定现有库

通过现有的Go库（例如下面用到的unipdf，直接以Go包的形式导入，无需cgo）提取PDF文本，再生成文档：

// 使用unipdf库提取文本，然后生成docx
package main

import (
    "archive/zip"
    "encoding/xml"
    "fmt"
    "os"
    "strings"

    "github.com/unidoc/unipdf/v3/common/license"
    "github.com/unidoc/unipdf/v3/extractor"
    "github.com/unidoc/unipdf/v3/model"
    "github.com/xuri/excelize/v2"
)

// init registers the unipdf license key before any other code in the package
// runs.
func init() {
    // Placeholder value: substitute the key obtained when registering.
    // NOTE(review): confirm the SetLicenseKey signature against the unipdf
    // version in use.
    const licenseKey = "your-license-key"
    license.SetLicenseKey(licenseKey)
}

// pdfToText extracts the text of every page of the PDF at pdfPath and returns
// it as one string, with a newline appended after each page.
//
// Any failure (opening the file, parsing it, or loading or extracting a page)
// aborts the extraction and is returned with the file path and, where
// applicable, the 1-based page number. Pages are never skipped silently, so a
// nil error means every page contributed its text.
func pdfToText(pdfPath string) (string, error) {
    f, err := os.Open(pdfPath)
    if err != nil {
        return "", fmt.Errorf("opening %q: %w", pdfPath, err)
    }
    defer f.Close()

    pdfReader, err := model.NewPdfReader(f)
    if err != nil {
        return "", fmt.Errorf("parsing %q: %w", pdfPath, err)
    }

    numPages, err := pdfReader.GetNumPages()
    if err != nil {
        return "", fmt.Errorf("counting pages of %q: %w", pdfPath, err)
    }

    // Accumulate into a byte slice: repeated string concatenation would copy
    // the whole result once per page.
    var buf []byte
    for i := 1; i <= numPages; i++ {
        page, err := pdfReader.GetPage(i)
        if err != nil {
            return "", fmt.Errorf("loading page %d of %q: %w", i, pdfPath, err)
        }

        ex, err := extractor.New(page)
        if err != nil {
            return "", fmt.Errorf("preparing extractor for page %d of %q: %w", i, pdfPath, err)
        }

        text, err := ex.ExtractText()
        if err != nil {
            return "", fmt.Errorf("extracting text from page %d of %q: %w", i, pdfPath, err)
        }

        buf = append(buf, text...)
        buf = append(buf, '\n')
    }

    return string(buf), nil
}

// saveTextAsDocx writes text to outputPath as a minimal Word (.docx) document.
//
// A .docx file is a ZIP archive of XML parts. This function writes the three
// required parts with the standard library and does not use excelize, which
// is a spreadsheet library and cannot produce Word documents. Every line of
// text becomes one paragraph; no other formatting is applied. One trailing
// newline is ignored so that it does not produce an empty final paragraph.
//
// outputPath is created or truncated. If an error occurs part-way through, a
// partially written file may be left behind.
func saveTextAsDocx(text, outputPath string) (err error) {
    const (
        xmlHeader = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>` + "\n"

        // Declares the content type of each part in the package.
        contentTypes = xmlHeader +
            `<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">` +
            `<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>` +
            `<Default Extension="xml" ContentType="application/xml"/>` +
            `<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>` +
            `</Types>`

        // Points readers at the main document part.
        rootRels = xmlHeader +
            `<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">` +
            `<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>` +
            `</Relationships>`
    )

    document, err := docxDocumentXML(xmlHeader, text)
    if err != nil {
        return fmt.Errorf("building document body: %w", err)
    }

    out, err := os.Create(outputPath)
    if err != nil {
        return fmt.Errorf("creating %q: %w", outputPath, err)
    }
    // A failed Close can mean lost data, so report it unless an earlier error
    // is already being returned.
    defer func() {
        if cerr := out.Close(); cerr != nil && err == nil {
            err = fmt.Errorf("closing %q: %w", outputPath, cerr)
        }
    }()

    zw := zip.NewWriter(out)
    parts := []struct{ name, content string }{
        {"[Content_Types].xml", contentTypes},
        {"_rels/.rels", rootRels},
        {"word/document.xml", document},
    }
    for _, p := range parts {
        w, err := zw.Create(p.name)
        if err != nil {
            return fmt.Errorf("adding %s to %q: %w", p.name, outputPath, err)
        }
        if _, err := w.Write([]byte(p.content)); err != nil {
            return fmt.Errorf("writing %s to %q: %w", p.name, outputPath, err)
        }
    }

    // Close writes the ZIP central directory; without it the archive is invalid.
    if err := zw.Close(); err != nil {
        return fmt.Errorf("finalizing %q: %w", outputPath, err)
    }
    return nil
}

// docxDocumentXML renders text as the word/document.xml part, one paragraph
// per line, with XML special characters escaped.
func docxDocumentXML(xmlHeader, text string) (string, error) {
    var b strings.Builder
    b.WriteString(xmlHeader)
    b.WriteString(`<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body>`)

    for _, line := range strings.Split(strings.TrimSuffix(text, "\n"), "\n") {
        line = strings.TrimSuffix(line, "\r") // tolerate CRLF line endings
        // xml:space="preserve" keeps leading and trailing spaces of the line.
        b.WriteString(`<w:p><w:r><w:t xml:space="preserve">`)
        if err := xml.EscapeText(&b, []byte(line)); err != nil {
            return "", err
        }
        b.WriteString(`</w:t></w:r></w:p>`)
    }

    b.WriteString(`</w:body></w:document>`)
    return b.String(), nil
}

方法3：使用REST API服务

调用在线转换API：

package main

import (
    "bytes"
    "fmt"
    "io"
    "mime/multipart"
    "net/http"
    "os"
)

// convertViaAPI uploads the PDF at pdfPath to the remote conversion endpoint
// as a multipart form and returns the raw response body.
//
// The file is sent in the "file" form field under the fixed name
// "document.pdf", together with a "format" field set to "docx". apiKey is
// sent as a Bearer token in the Authorization header.
//
// A non-2xx response is reported as an error that includes the status line
// and the beginning of the response body, rather than being returned as if it
// were the converted document.
//
// NOTE(review): the endpoint URL, field names and response format are taken
// as-is; confirm them against the provider's API documentation.
// NOTE(review): the HTTP client has no timeout configured.
func convertViaAPI(pdfPath, apiKey string) ([]byte, error) {
    file, err := os.Open(pdfPath)
    if err != nil {
        return nil, fmt.Errorf("opening %q: %w", pdfPath, err)
    }
    defer file.Close()

    // Build the multipart form in memory.
    body := &bytes.Buffer{}
    writer := multipart.NewWriter(body)

    part, err := writer.CreateFormFile("file", "document.pdf")
    if err != nil {
        return nil, fmt.Errorf("creating form file part: %w", err)
    }
    if _, err := io.Copy(part, file); err != nil {
        return nil, fmt.Errorf("reading %q: %w", pdfPath, err)
    }
    if err := writer.WriteField("format", "docx"); err != nil {
        return nil, fmt.Errorf("writing format field: %w", err)
    }
    // Close writes the terminating boundary; the form is invalid without it.
    if err := writer.Close(); err != nil {
        return nil, fmt.Errorf("finishing multipart form: %w", err)
    }

    req, err := http.NewRequest(http.MethodPost,
        "https://api.convertapi.com/convert/pdf/to/docx",
        body)
    if err != nil {
        return nil, fmt.Errorf("building request: %w", err)
    }
    req.Header.Set("Content-Type", writer.FormDataContentType())
    req.Header.Set("Authorization", "Bearer "+apiKey)

    client := &http.Client{}
    resp, err := client.Do(req)
    if err != nil {
        return nil, fmt.Errorf("calling conversion API: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
        // Best effort: a short excerpt of the body is only used to make the
        // error more useful, so a read failure here is not reported.
        excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
        return nil, fmt.Errorf("conversion API returned %s: %s", resp.Status, excerpt)
    }

    data, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("reading conversion API response: %w", err)
    }
    return data, nil
}

方法4：使用Docker容器

通过Go控制Docker运行转换工具：

package main

import (
    "context"
    "github.com/docker/docker/api/types"
    "github.com/docker/docker/api/types/container"
    "github.com/docker/docker/client"
    "io"
    "os"
)

// conversionExitError reports that the conversion container finished with a
// non-zero exit status; the numeric value is the container's status code.
type conversionExitError int64

// Error implements the error interface.
func (e conversionExitError) Error() string {
    return "conversion container exited with a non-zero status"
}

// convertWithDocker converts a PDF to DOCX by running LibreOffice inside a
// Docker container.
//
// pdfPath is bind-mounted into the container as /data/input.pdf and outputDir
// as /data/output, which is passed to LibreOffice as its output directory.
// The function pulls the image, creates and starts the container, and blocks
// until it is no longer running. A non-zero container exit status is returned
// as a conversionExitError.
//
// NOTE(review): the command line starts with LibreOffice options, so the
// image's entrypoint is presumably the LibreOffice binary — confirm for the
// image actually used.
// NOTE(review): bind mounts presumably need absolute host paths — verify
// against callers. The container is not removed after it finishes.
func convertWithDocker(pdfPath, outputDir string) error {
    ctx := context.Background()
    cli, err := client.NewClientWithOpts(client.FromEnv)
    if err != nil {
        return err
    }
    defer cli.Close()

    // Pull the conversion image. The returned progress stream has to be
    // drained and closed by the caller.
    reader, err := cli.ImagePull(ctx, "docker.io/libreoffice", types.ImagePullOptions{})
    if err != nil {
        return err
    }
    _, copyErr := io.Copy(os.Stdout, reader)
    closeErr := reader.Close()
    if copyErr != nil {
        return copyErr
    }
    if closeErr != nil {
        return closeErr
    }

    config := &container.Config{
        Image: "libreoffice",
        Cmd: []string{
            "--headless",
            // Import the PDF with Writer so that a DOCX export is possible.
            "--infilter=writer_pdf_import",
            "--convert-to", "docx",
            // Write into the mounted directory so the result reaches the host.
            "--outdir", "/data/output",
            "/data/input.pdf",
        },
    }

    hostConfig := &container.HostConfig{
        Binds: []string{
            pdfPath + ":/data/input.pdf",
            outputDir + ":/data/output",
        },
    }

    resp, err := cli.ContainerCreate(ctx, config, hostConfig, nil, nil, "")
    if err != nil {
        return err
    }

    if err := cli.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil {
        return err
    }

    // Block until the container stops, then surface a failed conversion.
    statusCh, errCh := cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning)
    select {
    case err := <-errCh:
        if err != nil {
            return err
        }
    case status := <-statusCh:
        if status.StatusCode != 0 {
            return conversionExitError(status.StatusCode)
        }
    }

    return nil
}

注意事项：

格式保持：PDF到Docx的转换很难完美保持原始格式
字体处理：嵌入字体可能无法正确转换
复杂布局：表格、多栏布局等可能无法准确转换
图像提取：PDF中的图像需要单独处理

最可靠的方法是使用方法1（LibreOffice），它提供了相对较好的格式保持能力。