Build a One‑Click Web Document Converter with Python & Flask
This tutorial walks you through building a web‑based universal document converter using Python and Flask, enabling batch conversion of PDF, Word, PPT, and Excel files into Markdown, with automatic packaging, Chinese support, simple deployment without command‑line skills, and practical use‑case examples.
Overview
This guide shows how to create a web‑based universal document converter with Python and Flask. The tool converts PDF, Word, PPT, and Excel files to Markdown, supports batch processing, Chinese characters, and provides one‑click download of the results.
Features
PDF → Markdown
Word (.docx) → Markdown
PPT (.pptx) → Markdown
Excel (.xlsx) → Markdown tables
Batch processing, automatic zipping, and download
No command‑line required; suitable for non‑technical users
Technology Stack
Python, Flask, marker‑pdf, python‑docx, python‑pptx, pandas, openpyxl, tabulate, markdownify, beautifulsoup4, and other standard libraries.
Project Structure
pdf2md-web/
├── app.py # Flask main program
├── config.py # Configuration
├── utils/
│ ├── converter.py
│ ├── excel_to_md.py
│ ├── word_to_md.py
│ ├── ppt_to_md.py
│ └── zip_utils.py
├── uploads/ # Uploaded files
├── outputs/ # Converted Markdown files
├── static/style.css
├── templates/
│ ├── index.html
│ └── result.html
└── requirements.txtInstallation
Install dependencies with pip install -r requirements.txt. Note that marker-pdf[all] pulls PyTorch and may take 5‑10 minutes on the first install.
Core Code
config.py
import os
from pathlib import Path
BASE_DIR = Path(__file__).parent
class Config:
UPLOAD_FOLDER = BASE_DIR / "uploads"
OUTPUT_FOLDER = BASE_DIR / "outputs"
MAX_CONTENT_LENGTH = 100 * 1024 * 1024 # 100MB
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'pptx', 'xlsx', 'xls'}
@staticmethod
def init_app(app):
for folder in [Config.UPLOAD_FOLDER, Config.OUTPUT_FOLDER]:
folder.mkdir(exist_ok=True)utils/excel_to_md.py
import pandas as pd
from tabulate import tabulate
def excel_to_markdown(excel_path, output_md_path):
try:
excel_file = pd.ExcelFile(excel_path)
md_lines = []
for sheet_name in excel_file.sheet_names:
df = pd.read_excel(excel_path, sheet_name=sheet_name)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
md_lines.append(f"## 表格: {sheet_name}")
md_lines.append(md_table)
md_lines.append("")
md_text = "
".join(md_lines)
with open(output_md_path, 'w', encoding='utf-8') as f:
f.write(md_text)
return True, None
except Exception as e:
return False, str(e)utils/converter.py
from pathlib import Path
from marker.converters.pdf import convert_pdf
from utils.word_to_md import word_to_markdown
from utils.ppt_to_md import pptx_to_markdown
from utils.excel_to_md import excel_to_markdown
def convert_document_to_markdown(input_path, output_folder):
input_path = Path(input_path)
output_folder = Path(output_folder)
output_folder.mkdir(exist_ok=True)
ext = input_path.suffix.lower()
md_file = output_folder / f"{input_path.stem}.md"
try:
if ext == '.pdf':
full_text, _, _ = convert_pdf(str(input_path))
md_file.write_text(full_text, encoding='utf-8')
return str(md_file), None
elif ext == '.docx':
success, error = word_to_markdown(str(input_path), str(md_file))
return (str(md_file), None) if success else (None, error)
elif ext == '.pptx':
success, error = pptx_to_markdown(str(input_path), str(md_file))
return (str(md_file), None) if success else (None, error)
elif ext in ['.xlsx', '.xls']:
success, error = excel_to_markdown(str(input_path), str(md_file))
return (str(md_file), None) if success else (None, error)
else:
return None, f"Unsupported format: {ext}"
except Exception as e:
return None, f"Conversion error: {str(e)}"
def batch_convert_documents(input_folder, output_folder):
input_folder = Path(input_folder)
success_list = []
failed_list = []
for file_path in input_folder.iterdir():
if file_path.suffix.lower() in ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']:
_, error = convert_document_to_markdown(file_path, output_folder)
if error:
failed_list.append(f"{file_path.name}: {error}")
else:
success_list.append(file_path.name)
return success_list, failed_listapp.py
from flask import Flask, request, render_template, send_file, flash, redirect, url_for
from werkzeug.utils import secure_filename
from config import Config
from utils.converter import batch_convert_documents
from utils.zip_utils import create_zip_file
app = Flask(__name__)
app.config.from_object(Config)
app.secret_key = 'your-secret-key'
Config.init_app(app)
@app.route('/', methods=['GET', 'POST'])
def index():
if request.method == 'POST':
for f in Config.OUTPUT_FOLDER.glob('*.md'):
f.unlink()
files = request.files.getlist('files')
if not files or all(f.filename == '' for f in files):
flash('No files selected')
return redirect(request.url)
uploaded_paths = []
for file in files:
if file and file.filename.endswith(('.pdf', '.docx', '.pptx', '.xlsx', '.xls')):
filename = secure_filename(file.filename)
file_path = Config.UPLOAD_FOLDER / filename
file.save(str(file_path))
uploaded_paths.append(file_path)
if not uploaded_paths:
flash('No supported file types uploaded')
return redirect(request.url)
success_list, failed_list = batch_convert_documents(Config.UPLOAD_FOLDER, Config.OUTPUT_FOLDER)
return render_template('result.html', success=success_list, failed=failed_list)
return render_template('index.html')
@app.route('/download_zip')
def download_zip():
md_files = list(Config.OUTPUT_FOLDER.glob('*.md'))
if not md_files:
return "No files to download", 404
file_names = [f.name for f in md_files]
zip_buffer = create_zip_file(Config.OUTPUT_FOLDER, file_names)
return send_file(zip_buffer, mimetype='application/zip', as_attachment=True, download_name='converted_markdown_files.zip')
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)Frontend Page (templates/index.html)
<!DOCTYPE html>
<html>
<head>
<title>📄 Universal Document to Markdown</title>
<style>
body {font-family: -apple-system, sans-serif; background:#f5f7fa;}
.container {max-width:800px; margin:40px auto; padding:30px; background:white; border-radius:12px;}
input[type="file"] {width:100%; padding:10px; border:2px dashed #3498db; border-radius:8px;}
button {background:#3498db; color:white; border:none; padding:12px 24px; border-radius:8px; cursor:pointer;}
button:hover {background:#2980b9;}
</style>
</head>
<body>
<div class="container">
<h1>🚀 Universal Document to Markdown</h1>
<p>Supports PDF / Word / PPT / Excel batch conversion</p>
<form method="post" enctype="multipart/form-data">
<div>
<label>Select files (supports .pdf .docx .pptx .xlsx):</label>
<input type="file" name="files" accept=".pdf,.docx,.pptx,.xlsx,.xls" multiple>
</div>
<button type="submit">🚀 Start Conversion</button>
</form>
</div>
</body>
</html>How to Run
# 1. Create project directory
mkdir pdf2md-web && cd pdf2md-web
# 2. Add all files as shown above
# 3. Install dependencies
pip install -r requirements.txt
# 4. Start the service
python app.py
# Access the application at http://localhost:5000Demo
Upload a folder containing PDF, Word, and Excel files; the system automatically detects each format, converts them to Markdown, packages the results into converted_markdown_files.zip, and provides a one‑click download.
Applicable Scenarios
Knowledge‑base construction: unify historical documents into Markdown.
AI training data preparation: clean document content for models.
Enterprise internal tools: usable by non‑technical staff.
Blog writing: quickly extract content from documents.
Conclusion
This tool has been used internally for three months, saving about two hours of document‑processing time per day. Combining Python’s power with AI makes office automation simple, and anyone can set it up.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
