feat: markitdown implementation (#486)

* feat: markitdown implementation

* fix: code review and docker file:

* fix: add markitdown PATH in container

* fix: feedback changes

* en: readme changed
This commit is contained in:
Sahil sharma
2025-12-27 12:59:17 +05:30
committed by GitHub
parent 8af8e59b4f
commit f2a92aaf39
4 changed files with 52 additions and 0 deletions

View File

@@ -74,9 +74,16 @@ RUN apt-get update && apt-get install -y \
texlive-latex-extra \
texlive-latex-recommended \
texlive-xetex \
python3 \
python3-pip \
pipx \
--no-install-recommends \
&& pipx install "markitdown[all]" \
&& rm -rf /var/lib/apt/lists/*
# Add pipx bin directory to PATH
ENV PATH="/root/.local/bin:${PATH}"
# Install VTracer binary
RUN ARCH=$(uname -m) && \
if [ "$ARCH" = "aarch64" ]; then \

View File

@@ -45,6 +45,7 @@ A self-hosted online file converter. Supports over a thousand different formats.
| [FFmpeg](https://ffmpeg.org/) | Video | ~472 | ~199 |
| [Potrace](https://potrace.sourceforge.net/) | Raster to vector | 4 | 11 |
| [VTracer](https://github.com/visioncortex/vtracer) | Raster to vector | 8 | 1 |
| [Markitdown](https://github.com/microsoft/markitdown) | Documents | 6 | 1 |
<!-- many FFmpeg file formats are duplicates -->

View File

@@ -23,6 +23,7 @@ import { convert as convertresvg, properties as propertiesresvg } from "./resvg"
import { convert as convertImage, properties as propertiesImage } from "./vips";
import { convert as convertVtracer, properties as propertiesVtracer } from "./vtracer";
import { convert as convertxelatex, properties as propertiesxelatex } from "./xelatex";
import { convert as convertMarkitdown, properties as propertiesMarkitdown } from "./markitdown";
// This should probably be restructured so that the converter functions are not imported here; instead, each converter would hook into this module, making the converters more modular
@@ -127,6 +128,10 @@ const properties: Record<
properties: propertiesVtracer,
converter: convertVtracer,
},
markitDown: {
properties: propertiesMarkitdown,
converter: convertMarkitdown,
},
};
function chunks<T>(arr: T[], size: number): T[][] {

View File

@@ -0,0 +1,39 @@
import { execFile as execFileOriginal } from "node:child_process";
import { ExecFileFn } from "./types";
/**
 * Format lists for the markitdown converter. `from` presumably holds the
 * input format names it accepts and `to` the output format names it produces;
 * both are grouped under a single `document` category.
 */
export const properties = {
from: {
// NOTE(review): "powerpoint" and "excel" do not look like file extensions,
// unlike the other entries — confirm these are the strings callers match on.
document: ["pdf", "powerpoint", "excel", "docx", "pptx", "html"],
},
to: {
// Markdown is the only output format listed.
document: ["md"],
},
};
/**
 * Converts a document by running the `markitdown` command-line tool as
 * `markitdown <filePath> -o <targetPath>`.
 *
 * @param filePath - Path of the input file, passed as the first CLI argument.
 * @param fileType - Not used by this converter.
 * @param convertTo - Not used by this converter.
 * @param targetPath - Path passed to the `-o` flag.
 * @param options - Not used by this converter.
 * @param execFile - Process launcher; defaults to Node's `execFile`. Supplying
 *   a substitute lets callers avoid spawning a real process.
 * @returns A promise resolving to the string `"Done"` when the launcher
 *   reports no error. When it reports an error, the promise rejects with a
 *   string (not an `Error`) of the form `markitdown error: <err>`.
 */
export async function convert(
  filePath: string,
  fileType: string,
  convertTo: string,
  targetPath: string,
  options?: unknown,
  execFile: ExecFileFn = execFileOriginal,
): Promise<string> {
  const cliArgs = [filePath, "-o", targetPath];

  return new Promise((onSuccess, onFailure) => {
    execFile("markitdown", cliArgs, (failure, out, errOut) => {
      if (!failure) {
        // Forward whatever the tool printed before reporting success.
        if (out) {
          console.log(`stdout: ${out}`);
        }
        if (errOut) {
          console.error(`stderr: ${errOut}`);
        }
        onSuccess("Done");
        return;
      }

      // On failure the captured output is not logged; only the error is reported.
      onFailure(`markitdown error: ${failure}`);
    });
  });
}