Commit ·
fcfb4e7
0
Parent(s):
CorrSteer article v2 - fix theme toggle and hash URL
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +18 -0
- .gitignore +41 -0
- CHANGELOG.md +118 -0
- CONTRIBUTING.md +196 -0
- Dockerfile +45 -0
- LICENSE +33 -0
- NOTION_IMPORT.md +186 -0
- README.md +18 -0
- app/astro.config.mjs +80 -0
- app/package-lock.json +0 -0
- app/package.json +0 -0
- app/plugins/rehype/code-copy.mjs +94 -0
- app/plugins/rehype/post-citation.mjs +449 -0
- app/plugins/rehype/restore-at-in-code.mjs +22 -0
- app/plugins/rehype/wrap-outputs.mjs +38 -0
- app/plugins/rehype/wrap-tables.mjs +43 -0
- app/plugins/remark/ignore-citations-in-code.mjs +21 -0
- app/plugins/remark/output-container.mjs +23 -0
- app/plugins/remark/outputs-container.mjs +23 -0
- app/plugins/remark/unwrap-citation-links.mjs +57 -0
- app/postcss.config.mjs +14 -0
- app/public/data +1 -0
- app/public/hf-space-parent-listener.js +70 -0
- app/public/scripts/color-palettes.js +274 -0
- app/scripts/export-latex.mjs +358 -0
- app/scripts/export-pdf.mjs +483 -0
- app/scripts/generate-trackio-data.mjs +196 -0
- app/scripts/generate_ablation_data.py +132 -0
- app/scripts/generate_ablation_data_correct.py +157 -0
- app/scripts/generate_ablation_data_final.py +158 -0
- app/scripts/generate_ablation_data_fixed.py +131 -0
- app/scripts/jitter-trackio-data.mjs +129 -0
- app/scripts/latex-importer/README.md +169 -0
- app/scripts/latex-importer/bib-cleaner.mjs +104 -0
- app/scripts/latex-importer/filters/equation-ids.lua +134 -0
- app/scripts/latex-importer/index.mjs +138 -0
- app/scripts/latex-importer/latex-converter.mjs +330 -0
- app/scripts/latex-importer/mdx-converter.mjs +896 -0
- app/scripts/latex-importer/metadata-extractor.mjs +170 -0
- app/scripts/latex-importer/package-lock.json +0 -0
- app/scripts/latex-importer/package.json +0 -0
- app/scripts/latex-importer/post-processor.mjs +439 -0
- app/scripts/latex-importer/reference-preprocessor.mjs +239 -0
- app/scripts/notion-importer/.cursorignore +1 -0
- app/scripts/notion-importer/README.md +334 -0
- app/scripts/notion-importer/env.example +2 -0
- app/scripts/notion-importer/index.mjs +494 -0
- app/scripts/notion-importer/input/pages.json +3 -0
- app/scripts/notion-importer/mdx-converter.mjs +863 -0
- app/scripts/notion-importer/notion-converter.mjs +266 -0
.gitattributes
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.mov filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.avi filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
# the package and package lock should not be tracked
|
| 14 |
+
package.json -filter -diff -merge text
|
| 15 |
+
package-lock.json -filter -diff -merge text
|
| 16 |
+
# Notion imported images should NOT be in LFS (needed for Docker build)
|
| 17 |
+
app/src/content/assets/image/image_27877f1c*.png -filter -diff -merge text
|
| 18 |
+
app/scripts/notion-importer/output/** -filter -diff -merge text
|
.gitignore
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.so
|
| 5 |
+
.Python
|
| 6 |
+
env/
|
| 7 |
+
venv/
|
| 8 |
+
*.egg-info/
|
| 9 |
+
dist/
|
| 10 |
+
build/
|
| 11 |
+
*.egg
|
| 12 |
+
.idea/
|
| 13 |
+
.vscode/
|
| 14 |
+
.astro/
|
| 15 |
+
.claude/
|
| 16 |
+
*.swp
|
| 17 |
+
.DS_Store
|
| 18 |
+
# Node
|
| 19 |
+
node_modules/
|
| 20 |
+
*.log
|
| 21 |
+
*.env
|
| 22 |
+
*.cache
|
| 23 |
+
.notion-to-md
|
| 24 |
+
|
| 25 |
+
app/scripts/latex-to-mdx/output/
|
| 26 |
+
app/scripts/notion-importer/output/**/*
|
| 27 |
+
app/src/content/embeds/typography/generated
|
| 28 |
+
|
| 29 |
+
# PDF export
|
| 30 |
+
app/public/*.pdf
|
| 31 |
+
app/public/*.png
|
| 32 |
+
app/public/*.jpg
|
| 33 |
+
app/public/data/**/*
|
| 34 |
+
|
| 35 |
+
.astro/
|
| 36 |
+
|
| 37 |
+
# Template sync temporary directories
|
| 38 |
+
.template-sync/
|
| 39 |
+
.temp-*/
|
| 40 |
+
.backup-*/
|
| 41 |
+
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to the Research Article Template will be documented in this file.
|
| 4 |
+
|
| 5 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
| 6 |
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
| 7 |
+
|
| 8 |
+
## [Unreleased]
|
| 9 |
+
|
| 10 |
+
### Added
|
| 11 |
+
- Initial open source release
|
| 12 |
+
- Comprehensive documentation
|
| 13 |
+
- Contributing guidelines
|
| 14 |
+
- License file
|
| 15 |
+
|
| 16 |
+
## [1.0.0] - 2024-12-19
|
| 17 |
+
|
| 18 |
+
### Added
|
| 19 |
+
- **Core Features**:
|
| 20 |
+
- Markdown/MDX-based writing system
|
| 21 |
+
- KaTeX mathematical notation support
|
| 22 |
+
- Syntax highlighting for code blocks
|
| 23 |
+
- Academic citations with BibTeX integration
|
| 24 |
+
- Footnotes and sidenotes system
|
| 25 |
+
- Auto-generated table of contents
|
| 26 |
+
- Interactive Mermaid diagrams
|
| 27 |
+
- Plotly.js and D3.js integration
|
| 28 |
+
- HTML embed support
|
| 29 |
+
- Gradio app embedding
|
| 30 |
+
- Dataviz color palettes
|
| 31 |
+
- Image optimization
|
| 32 |
+
- SEO-friendly structure
|
| 33 |
+
- Automatic PDF export
|
| 34 |
+
- Dark/light theme toggle
|
| 35 |
+
- Mobile-responsive design
|
| 36 |
+
- LaTeX import functionality
|
| 37 |
+
- Template synchronization system
|
| 38 |
+
|
| 39 |
+
- **Components**:
|
| 40 |
+
- Figure component with captions
|
| 41 |
+
- MultiFigure for image galleries
|
| 42 |
+
- Note component with variants
|
| 43 |
+
- Quote component
|
| 44 |
+
- Accordion for collapsible content
|
| 45 |
+
- Sidenote component
|
| 46 |
+
- Table of Contents
|
| 47 |
+
- Theme Toggle
|
| 48 |
+
- HTML Embed
|
| 49 |
+
- Raw HTML support
|
| 50 |
+
- SEO component
|
| 51 |
+
- Hero section
|
| 52 |
+
- Footer
|
| 53 |
+
- Full-width and wide layouts
|
| 54 |
+
|
| 55 |
+
- **Build System**:
|
| 56 |
+
- Astro 4.10.0 integration
|
| 57 |
+
- PostCSS with custom media queries
|
| 58 |
+
- Automatic compression
|
| 59 |
+
- Docker support
|
| 60 |
+
- Nginx configuration
|
| 61 |
+
- Git LFS support
|
| 62 |
+
|
| 63 |
+
- **Scripts**:
|
| 64 |
+
- PDF export functionality
|
| 65 |
+
- LaTeX to MDX conversion
|
| 66 |
+
- Template synchronization
|
| 67 |
+
- Font SVG generation
|
| 68 |
+
- TrackIO data generation
|
| 69 |
+
|
| 70 |
+
- **Documentation**:
|
| 71 |
+
- Getting started guide
|
| 72 |
+
- Writing best practices
|
| 73 |
+
- Component reference
|
| 74 |
+
- LaTeX conversion guide
|
| 75 |
+
- Interactive examples
|
| 76 |
+
|
| 77 |
+
### Technical Details
|
| 78 |
+
- **Framework**: Astro 4.10.0
|
| 79 |
+
- **Styling**: PostCSS with custom properties
|
| 80 |
+
- **Math**: KaTeX 0.16.22
|
| 81 |
+
- **Charts**: Plotly.js 3.1.0, D3.js 7.9.0
|
| 82 |
+
- **Diagrams**: Mermaid 11.10.1
|
| 83 |
+
- **Node.js**: >=20.0.0
|
| 84 |
+
- **License**: CC-BY-4.0
|
| 85 |
+
|
| 86 |
+
### Browser Support
|
| 87 |
+
- Chrome (latest)
|
| 88 |
+
- Firefox (latest)
|
| 89 |
+
- Safari (latest)
|
| 90 |
+
- Edge (latest)
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## Version History
|
| 95 |
+
|
| 96 |
+
- **1.0.0**: Initial stable release with full feature set
|
| 97 |
+
- **0.0.1**: Development version (pre-release)
|
| 98 |
+
|
| 99 |
+
## Migration Guide
|
| 100 |
+
|
| 101 |
+
### From 0.0.1 to 1.0.0
|
| 102 |
+
|
| 103 |
+
This is the first stable release. No breaking changes from the development version.
|
| 104 |
+
|
| 105 |
+
### Updating Your Project
|
| 106 |
+
|
| 107 |
+
Use the template synchronization system to update:
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
npm run sync:template -- --dry-run # Preview changes
|
| 111 |
+
npm run sync:template # Apply updates
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
## Support
|
| 115 |
+
|
| 116 |
+
- **Documentation**: [Hugging Face Space](https://huggingface.co/spaces/tfrere/research-article-template)
|
| 117 |
+
- **Issues**: [Community Discussions](https://huggingface.co/spaces/tfrere/research-article-template/discussions)
|
| 118 |
+
- **Contact**: [@tfrere](https://huggingface.co/tfrere)
|
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to Research Article Template
|
| 2 |
+
|
| 3 |
+
Thank you for your interest in contributing to the Research Article Template! This document provides guidelines and information for contributors.
|
| 4 |
+
|
| 5 |
+
## 🤝 How to Contribute
|
| 6 |
+
|
| 7 |
+
### Reporting Issues
|
| 8 |
+
|
| 9 |
+
Before creating an issue, please:
|
| 10 |
+
1. **Search existing issues** to avoid duplicates
|
| 11 |
+
2. **Use the issue template** when available
|
| 12 |
+
3. **Provide detailed information**:
|
| 13 |
+
- Clear description of the problem
|
| 14 |
+
- Steps to reproduce
|
| 15 |
+
- Expected vs actual behavior
|
| 16 |
+
- Environment details (OS, Node.js version, browser)
|
| 17 |
+
- Screenshots if applicable
|
| 18 |
+
|
| 19 |
+
### Suggesting Features
|
| 20 |
+
|
| 21 |
+
We welcome feature suggestions! Please:
|
| 22 |
+
1. **Check existing discussions** first
|
| 23 |
+
2. **Describe the use case** clearly
|
| 24 |
+
3. **Explain the benefits** for the community
|
| 25 |
+
4. **Consider implementation complexity**
|
| 26 |
+
|
| 27 |
+
### Code Contributions
|
| 28 |
+
|
| 29 |
+
#### Getting Started
|
| 30 |
+
|
| 31 |
+
1. **Fork the repository** on Hugging Face
|
| 32 |
+
2. **Clone your fork**:
|
| 33 |
+
```bash
|
| 34 |
+
git clone git@hf.co:spaces/<your-username>/research-article-template
|
| 35 |
+
cd research-article-template
|
| 36 |
+
```
|
| 37 |
+
3. **Install dependencies**:
|
| 38 |
+
```bash
|
| 39 |
+
cd app
|
| 40 |
+
npm install
|
| 41 |
+
```
|
| 42 |
+
4. **Create a feature branch**:
|
| 43 |
+
```bash
|
| 44 |
+
git checkout -b feature/your-feature-name
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
#### Development Workflow
|
| 48 |
+
|
| 49 |
+
1. **Make your changes** following our coding standards
|
| 50 |
+
2. **Test thoroughly**:
|
| 51 |
+
```bash
|
| 52 |
+
npm run dev # Test locally
|
| 53 |
+
npm run build # Ensure build works
|
| 54 |
+
```
|
| 55 |
+
3. **Update documentation** if needed
|
| 56 |
+
4. **Commit with clear messages**:
|
| 57 |
+
```bash
|
| 58 |
+
git commit -m "feat: add new component for interactive charts"
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
#### Pull Request Process
|
| 62 |
+
|
| 63 |
+
1. **Push your branch**:
|
| 64 |
+
```bash
|
| 65 |
+
git push origin feature/your-feature-name
|
| 66 |
+
```
|
| 67 |
+
2. **Create a Pull Request** with:
|
| 68 |
+
- Clear title and description
|
| 69 |
+
- Reference related issues
|
| 70 |
+
- Screenshots for UI changes
|
| 71 |
+
- Testing instructions
|
| 72 |
+
|
| 73 |
+
## 📋 Coding Standards
|
| 74 |
+
|
| 75 |
+
### Code Style
|
| 76 |
+
|
| 77 |
+
- **Use Prettier** for consistent formatting
|
| 78 |
+
- **Follow existing patterns** in the codebase
|
| 79 |
+
- **Write clear, self-documenting code**
|
| 80 |
+
- **Add comments** for complex logic
|
| 81 |
+
- **Use meaningful variable names**
|
| 82 |
+
|
| 83 |
+
### File Organization
|
| 84 |
+
|
| 85 |
+
- **Components**: Place in `src/components/`
|
| 86 |
+
- **Styles**: Use CSS modules or component-scoped styles
|
| 87 |
+
- **Assets**: Organize in `src/content/assets/`
|
| 88 |
+
- **Documentation**: Update relevant `.mdx` files
|
| 89 |
+
|
| 90 |
+
### Commit Message Format
|
| 91 |
+
|
| 92 |
+
We follow [Conventional Commits](https://www.conventionalcommits.org/):
|
| 93 |
+
|
| 94 |
+
```
|
| 95 |
+
type(scope): description
|
| 96 |
+
|
| 97 |
+
feat: add new interactive chart component
|
| 98 |
+
fix: resolve mobile layout issues
|
| 99 |
+
docs: update installation instructions
|
| 100 |
+
style: improve button hover states
|
| 101 |
+
refactor: simplify component structure
|
| 102 |
+
test: add unit tests for utility functions
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
**Types**: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore`
|
| 106 |
+
|
| 107 |
+
## 🧪 Testing
|
| 108 |
+
|
| 109 |
+
### Manual Testing
|
| 110 |
+
|
| 111 |
+
Before submitting:
|
| 112 |
+
- [ ] Test on different screen sizes
|
| 113 |
+
- [ ] Verify dark/light theme compatibility
|
| 114 |
+
- [ ] Check browser compatibility (Chrome, Firefox, Safari)
|
| 115 |
+
- [ ] Test with different content types
|
| 116 |
+
- [ ] Ensure accessibility standards
|
| 117 |
+
|
| 118 |
+
### Automated Testing
|
| 119 |
+
|
| 120 |
+
```bash
|
| 121 |
+
# Run build to catch errors
|
| 122 |
+
npm run build
|
| 123 |
+
|
| 124 |
+
# Test PDF export
|
| 125 |
+
npm run export:pdf
|
| 126 |
+
|
| 127 |
+
# Test LaTeX conversion
|
| 128 |
+
npm run latex:convert
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
## 📚 Documentation
|
| 132 |
+
|
| 133 |
+
### Writing Guidelines
|
| 134 |
+
|
| 135 |
+
- **Use clear, concise language**
|
| 136 |
+
- **Provide examples** for complex features
|
| 137 |
+
- **Include screenshots** for UI changes
|
| 138 |
+
- **Update both English content and code comments**
|
| 139 |
+
|
| 140 |
+
### Documentation Structure
|
| 141 |
+
|
| 142 |
+
- **README.md**: Project overview and quick start
|
| 143 |
+
- **CONTRIBUTING.md**: This file
|
| 144 |
+
- **Content files**: In `src/content/chapters/demo/`
|
| 145 |
+
- **Component docs**: Inline comments and examples
|
| 146 |
+
|
| 147 |
+
## 🎯 Areas for Contribution
|
| 148 |
+
|
| 149 |
+
### High Priority
|
| 150 |
+
|
| 151 |
+
- **Bug fixes** and stability improvements
|
| 152 |
+
- **Accessibility enhancements**
|
| 153 |
+
- **Mobile responsiveness**
|
| 154 |
+
- **Performance optimizations**
|
| 155 |
+
- **Documentation improvements**
|
| 156 |
+
|
| 157 |
+
### Feature Ideas
|
| 158 |
+
|
| 159 |
+
- **New interactive components**
|
| 160 |
+
- **Additional export formats**
|
| 161 |
+
- **Enhanced LaTeX import**
|
| 162 |
+
- **Theme customization**
|
| 163 |
+
- **Plugin system**
|
| 164 |
+
|
| 165 |
+
### Community
|
| 166 |
+
|
| 167 |
+
- **Answer questions** in discussions
|
| 168 |
+
- **Share examples** of your work
|
| 169 |
+
- **Write tutorials** and guides
|
| 170 |
+
- **Help with translations**
|
| 171 |
+
|
| 172 |
+
## 🚫 What Not to Contribute
|
| 173 |
+
|
| 174 |
+
- **Breaking changes** without discussion
|
| 175 |
+
- **Major architectural changes** without approval
|
| 176 |
+
- **Dependencies** that significantly increase bundle size
|
| 177 |
+
- **Features** that don't align with the project's goals
|
| 178 |
+
|
| 179 |
+
## 📞 Getting Help
|
| 180 |
+
|
| 181 |
+
- **Discussions**: [Community tab](https://huggingface.co/spaces/tfrere/research-article-template/discussions)
|
| 182 |
+
- **Issues**: [Report bugs](https://huggingface.co/spaces/tfrere/research-article-template/discussions?status=open&type=issue)
|
| 183 |
+
- **Contact**: [@tfrere](https://huggingface.co/tfrere) on Hugging Face
|
| 184 |
+
|
| 185 |
+
## 📄 License
|
| 186 |
+
|
| 187 |
+
By contributing, you agree that your contributions will be licensed under the same [CC-BY-4.0 license](LICENSE) that covers the project.
|
| 188 |
+
|
| 189 |
+
## 🙏 Recognition
|
| 190 |
+
|
| 191 |
+
Contributors will be:
|
| 192 |
+
- **Listed in acknowledgments** (if desired)
|
| 193 |
+
- **Mentioned in release notes** for significant contributions
|
| 194 |
+
- **Credited** in relevant documentation
|
| 195 |
+
|
| 196 |
+
Thank you for helping make scientific writing more accessible and interactive! 🎉
|
Dockerfile
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Stage 1: Build Astro static site
|
| 2 |
+
FROM node:18-slim AS build
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
COPY app/package*.json ./
|
| 6 |
+
RUN npm install
|
| 7 |
+
|
| 8 |
+
COPY app/ .
|
| 9 |
+
|
| 10 |
+
# Ensure public/data is a real directory (not symlink)
|
| 11 |
+
RUN set -e; \
|
| 12 |
+
rm -rf public/data; \
|
| 13 |
+
mkdir -p public/data; \
|
| 14 |
+
cp -a src/content/assets/data/. public/data/
|
| 15 |
+
|
| 16 |
+
RUN npm run build
|
| 17 |
+
|
| 18 |
+
# Stage 2: Python backend + nginx + static site
|
| 19 |
+
FROM python:3.10-slim
|
| 20 |
+
|
| 21 |
+
RUN apt-get update && apt-get install -y nginx && rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Install Python backend dependencies
|
| 24 |
+
WORKDIR /backend
|
| 25 |
+
COPY backend/requirements.txt .
|
| 26 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 27 |
+
|
| 28 |
+
# Copy backend code
|
| 29 |
+
COPY backend/ /backend/
|
| 30 |
+
|
| 31 |
+
# Copy built Astro site
|
| 32 |
+
COPY --from=build /app/dist /app/dist
|
| 33 |
+
|
| 34 |
+
# Copy nginx config + entrypoint
|
| 35 |
+
COPY nginx.conf /etc/nginx/nginx.conf
|
| 36 |
+
COPY entrypoint.sh /entrypoint.sh
|
| 37 |
+
RUN chmod +x /entrypoint.sh
|
| 38 |
+
|
| 39 |
+
# Permissions
|
| 40 |
+
RUN mkdir -p /var/cache/nginx /var/run /var/log/nginx /var/lib/nginx/body /tmp && \
|
| 41 |
+
chmod -R 777 /var/cache/nginx /var/run /var/log/nginx /var/lib/nginx /tmp /app /backend
|
| 42 |
+
|
| 43 |
+
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
ENTRYPOINT ["/entrypoint.sh"]
|
LICENSE
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Creative Commons Attribution 4.0 International License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 Thibaud Frere
|
| 4 |
+
|
| 5 |
+
This work is licensed under the Creative Commons Attribution 4.0 International License.
|
| 6 |
+
To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/
|
| 7 |
+
or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
|
| 8 |
+
|
| 9 |
+
You are free to:
|
| 10 |
+
|
| 11 |
+
Share — copy and redistribute the material in any medium or format
|
| 12 |
+
Adapt — remix, transform, and build upon the material for any purpose, even commercially.
|
| 13 |
+
|
| 14 |
+
The licensor cannot revoke these freedoms as long as you follow the license terms.
|
| 15 |
+
|
| 16 |
+
Under the following terms:
|
| 17 |
+
|
| 18 |
+
Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
|
| 19 |
+
|
| 20 |
+
No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
|
| 21 |
+
|
| 22 |
+
Notices:
|
| 23 |
+
|
| 24 |
+
You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
|
| 25 |
+
|
| 26 |
+
No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
For the source code and technical implementation:
|
| 31 |
+
- The source code is available at: https://huggingface.co/spaces/tfrere/research-article-template
|
| 32 |
+
- Third-party figures and assets are excluded from this license and marked in their captions
|
| 33 |
+
- Dependencies and third-party libraries maintain their respective licenses
|
NOTION_IMPORT.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 📖 Guide d'importation depuis Notion
|
| 2 |
+
|
| 3 |
+
Ce guide explique comment configurer l'importation automatique depuis Notion lors du build de votre Space HuggingFace.
|
| 4 |
+
|
| 5 |
+
## 🎯 Principe de fonctionnement
|
| 6 |
+
|
| 7 |
+
Lors du build Docker sur HuggingFace Spaces, si les variables d'environnement sont configurées :
|
| 8 |
+
1. Le script va chercher votre page Notion
|
| 9 |
+
2. Extrait automatiquement le titre et génère le slug
|
| 10 |
+
3. Convertit le contenu en MDX
|
| 11 |
+
4. Build l'application avec le nouveau contenu
|
| 12 |
+
|
| 13 |
+
**Avantage :** Vous modifiez votre article dans Notion, puis vous cliquez sur "Factory Reboot" dans HF Spaces → le site est automatiquement mis à jour !
|
| 14 |
+
|
| 15 |
+
## ⚙️ Configuration sur HuggingFace Spaces
|
| 16 |
+
|
| 17 |
+
### 1. Créer une intégration Notion
|
| 18 |
+
|
| 19 |
+
1. Allez sur https://www.notion.so/my-integrations
|
| 20 |
+
2. Cliquez sur "New integration"
|
| 21 |
+
3. Donnez un nom (ex: "HF Article Importer")
|
| 22 |
+
4. Sélectionnez votre workspace
|
| 23 |
+
5. Cliquez sur "Submit"
|
| 24 |
+
6. **Copiez le token** (format: `secret_xxxxx...`)
|
| 25 |
+
|
| 26 |
+
### 2. Partager votre page Notion avec l'intégration
|
| 27 |
+
|
| 28 |
+
1. Ouvrez votre page Notion
|
| 29 |
+
2. Cliquez sur "Share" (en haut à droite)
|
| 30 |
+
3. Cliquez sur "Invite"
|
| 31 |
+
4. Recherchez le nom de votre intégration
|
| 32 |
+
5. Sélectionnez-la et donnez la permission "Can read content"
|
| 33 |
+
6. Cliquez sur "Invite"
|
| 34 |
+
|
| 35 |
+
### 3. Récupérer l'ID de votre page Notion
|
| 36 |
+
|
| 37 |
+
L'ID se trouve dans l'URL de votre page :
|
| 38 |
+
```
|
| 39 |
+
https://www.notion.so/Mon-Article-27877f1c9c9d804d9c82f7b3905578ff
|
| 40 |
+
└─────────────────┬─────────────────┘
|
| 41 |
+
C'est cet ID !
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Exemple : `27877f1c9c9d804d9c82f7b3905578ff`
|
| 45 |
+
|
| 46 |
+
### 4. Configurer les variables d'environnement sur HF Spaces
|
| 47 |
+
|
| 48 |
+
1. Allez dans les Settings de votre Space
|
| 49 |
+
2. Section "Repository secrets"
|
| 50 |
+
3. Ajoutez ces 3 variables :
|
| 51 |
+
|
| 52 |
+
| Variable | Valeur | Secret ? |
|
| 53 |
+
|----------|--------|----------|
|
| 54 |
+
| `ENABLE_NOTION_IMPORT` | `true` | Non |
|
| 55 |
+
| `NOTION_TOKEN` | `secret_xxx...` | **Oui** ✅ |
|
| 56 |
+
| `NOTION_PAGE_ID` | `27877f1c...` | Non |
|
| 57 |
+
|
| 58 |
+
**Important :** Cochez la case "Secret" pour `NOTION_TOKEN` uniquement !
|
| 59 |
+
|
| 60 |
+
### 5. Rebuild votre Space
|
| 61 |
+
|
| 62 |
+
1. Allez dans l'onglet "Settings"
|
| 63 |
+
2. Cliquez sur "Factory reboot"
|
| 64 |
+
3. Attendez le rebuild (~5-10 minutes)
|
| 65 |
+
4. Votre article Notion est maintenant publié ! 🎉
|
| 66 |
+
|
| 67 |
+
## 🔄 Workflow de mise à jour
|
| 68 |
+
|
| 69 |
+
```
|
| 70 |
+
┌─────────────────────────┐
|
| 71 |
+
│ 1. Éditez dans Notion │
|
| 72 |
+
│ (brouillon privé) │
|
| 73 |
+
└───────────┬─────────────┘
|
| 74 |
+
│
|
| 75 |
+
↓
|
| 76 |
+
┌─────────────────────────┐
|
| 77 |
+
│ 2. Vérifiez le contenu │
|
| 78 |
+
│ (preview Notion) │
|
| 79 |
+
└───────────┬─────────────┘
|
| 80 |
+
│
|
| 81 |
+
↓
|
| 82 |
+
┌─────────────────────────┐
|
| 83 |
+
│ 3. HF Spaces → │
|
| 84 |
+
│ "Factory Reboot" │
|
| 85 |
+
└───────────┬─────────────┘
|
| 86 |
+
│
|
| 87 |
+
↓
|
| 88 |
+
┌─────────────────────────┐
|
| 89 |
+
│ 4. Attendez 5-10 min │
|
| 90 |
+
│ (build Docker) │
|
| 91 |
+
└───────────┬─────────────┘
|
| 92 |
+
│
|
| 93 |
+
↓
|
| 94 |
+
┌─────────────────────────┐
|
| 95 |
+
│ 5. Site mis à jour ! ✅ │
|
| 96 |
+
│ (zéro downtime) │
|
| 97 |
+
└─────────────────────────┘
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
## 🧪 Test en local
|
| 101 |
+
|
| 102 |
+
Avant de publier, vous pouvez tester en local :
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
# 1. Créer un fichier .env dans app/scripts/notion-importer/
|
| 106 |
+
cd app/scripts/notion-importer
|
| 107 |
+
cp env.example .env
|
| 108 |
+
|
| 109 |
+
# 2. Éditer .env avec vos credentials
|
| 110 |
+
# NOTION_TOKEN=secret_xxx
|
| 111 |
+
# NOTION_PAGE_ID=abc123
|
| 112 |
+
|
| 113 |
+
# 3. Installer les dépendances
|
| 114 |
+
npm install
|
| 115 |
+
|
| 116 |
+
# 4. Lancer l'import
|
| 117 |
+
node index.mjs
|
| 118 |
+
|
| 119 |
+
# 5. Le contenu est copié dans app/src/content/article.mdx
|
| 120 |
+
# Les images dans app/src/content/assets/image/
|
| 121 |
+
|
| 122 |
+
# 6. Lancer le serveur de dev Astro
|
| 123 |
+
cd ../.. # Retour à app/
|
| 124 |
+
npm run dev
|
| 125 |
+
|
| 126 |
+
# 7. Ouvrir http://localhost:4321
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## 📋 Fonctionnalités supportées
|
| 130 |
+
|
| 131 |
+
### ✅ Supporté automatiquement
|
| 132 |
+
- Texte formaté (gras, italique, code inline)
|
| 133 |
+
- Titres (h1, h2, h3, etc.)
|
| 134 |
+
- Listes (ordonnées, non-ordonnées)
|
| 135 |
+
- Images (téléchargées et converties)
|
| 136 |
+
- Liens externes
|
| 137 |
+
- Blocs de code avec syntaxe
|
| 138 |
+
- Callouts → Composant `Note`
|
| 139 |
+
- Tables → Composant stylisé
|
| 140 |
+
- Citations
|
| 141 |
+
- Équations LaTeX (inline et bloc)
|
| 142 |
+
|
| 143 |
+
### ⚠️ Conversion manuelle requise
|
| 144 |
+
- Bases de données Notion → Créer en MDX
|
| 145 |
+
- Toggles → Utiliser `Accordion`
|
| 146 |
+
- Embeds complexes → Utiliser `HtmlEmbed`
|
| 147 |
+
- Graphiques → Utiliser `Trackio` ou d3.js
|
| 148 |
+
|
| 149 |
+
## 🔧 Désactiver l'import Notion
|
| 150 |
+
|
| 151 |
+
Pour revenir à l'édition manuelle du MDX :
|
| 152 |
+
|
| 153 |
+
1. HF Spaces → Settings → Repository secrets
|
| 154 |
+
2. Changez `ENABLE_NOTION_IMPORT` à `false`
|
| 155 |
+
3. Ou supprimez les variables d'env
|
| 156 |
+
|
| 157 |
+
Le site continuera de fonctionner avec le dernier contenu importé.
|
| 158 |
+
|
| 159 |
+
## 🆘 Dépannage
|
| 160 |
+
|
| 161 |
+
### Erreur "❌ NOTION_TOKEN not found"
|
| 162 |
+
→ Vérifiez que vous avez bien créé la variable `NOTION_TOKEN` dans les secrets HF
|
| 163 |
+
|
| 164 |
+
### Erreur "❌ Could not find Notion page"
|
| 165 |
+
→ Vérifiez que vous avez bien partagé la page avec votre intégration Notion
|
| 166 |
+
|
| 167 |
+
### L'import ne se lance pas au build
|
| 168 |
+
→ Vérifiez que `ENABLE_NOTION_IMPORT=true` (sans guillemets)
|
| 169 |
+
|
| 170 |
+
### Le build échoue pendant l'import
|
| 171 |
+
→ Regardez les logs du build dans HF Spaces pour voir l'erreur exacte
|
| 172 |
+
|
| 173 |
+
## 💡 Conseils
|
| 174 |
+
|
| 175 |
+
1. **Testez en local d'abord** : Évitez les surprises en prod
|
| 176 |
+
2. **Structure claire** : Utilisez bien les titres h1, h2, h3 dans Notion
|
| 177 |
+
3. **Images optimisées** : Les images sont téléchargées et intégrées
|
| 178 |
+
4. **Commits Git** : Pour un vrai versioning, committez aussi les MDX générés
|
| 179 |
+
5. **Brouillons** : Gardez des pages privées pour vos brouillons Notion
|
| 180 |
+
|
| 181 |
+
## 📚 Pour aller plus loin
|
| 182 |
+
|
| 183 |
+
- [Documentation Notion API](https://developers.notion.com/)
|
| 184 |
+
- [Documentation HuggingFace Spaces](https://huggingface.co/docs/hub/spaces)
|
| 185 |
+
- [README du Notion Importer](./app/scripts/notion-importer/README.md)
|
| 186 |
+
|
README.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: 'CorrSteer: Correlation-Based Steering of Language Models via Sparse Autoencoders'
|
| 3 |
+
short_desc: 'Interactive article with live GPT-2 steering demo'
|
| 4 |
+
emoji: 🧭
|
| 5 |
+
colorFrom: blue
|
| 6 |
+
colorTo: indigo
|
| 7 |
+
sdk: docker
|
| 8 |
+
pinned: false
|
| 9 |
+
header: mini
|
| 10 |
+
app_port: 7860
|
| 11 |
+
tags:
|
| 12 |
+
- research-article-template
|
| 13 |
+
- research paper
|
| 14 |
+
- scientific paper
|
| 15 |
+
- data visualization
|
| 16 |
+
- mechanistic interpretability
|
| 17 |
+
- sparse autoencoders
|
| 18 |
+
---
|
app/astro.config.mjs
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from 'astro/config';
|
| 2 |
+
import mdx from '@astrojs/mdx';
|
| 3 |
+
import svelte from '@astrojs/svelte';
|
| 4 |
+
import mermaid from 'astro-mermaid';
|
| 5 |
+
import compressor from 'astro-compressor';
|
| 6 |
+
import remarkMath from 'remark-math';
|
| 7 |
+
import rehypeKatex from 'rehype-katex';
|
| 8 |
+
import remarkFootnotes from 'remark-footnotes';
|
| 9 |
+
import rehypeSlug from 'rehype-slug';
|
| 10 |
+
import rehypeAutolinkHeadings from 'rehype-autolink-headings';
|
| 11 |
+
import rehypeCitation from 'rehype-citation';
|
| 12 |
+
import rehypeCodeCopy from './plugins/rehype/code-copy.mjs';
|
| 13 |
+
import rehypeReferencesAndFootnotes from './plugins/rehype/post-citation.mjs';
|
| 14 |
+
import remarkIgnoreCitationsInCode from './plugins/remark/ignore-citations-in-code.mjs';
|
| 15 |
+
import remarkUnwrapCitationLinks from './plugins/remark/unwrap-citation-links.mjs';
|
| 16 |
+
import remarkDirective from 'remark-directive';
|
| 17 |
+
import remarkOutputContainer from './plugins/remark/output-container.mjs';
|
| 18 |
+
import rehypeRestoreAtInCode from './plugins/rehype/restore-at-in-code.mjs';
|
| 19 |
+
import rehypeWrapTables from './plugins/rehype/wrap-tables.mjs';
|
| 20 |
+
import rehypeWrapOutput from './plugins/rehype/wrap-outputs.mjs';
|
| 21 |
+
// Built-in Shiki (dual themes) — no rehype-pretty-code
|
| 22 |
+
|
| 23 |
+
// Plugins moved to app/plugins/*
|
| 24 |
+
|
| 25 |
+
export default defineConfig({
|
| 26 |
+
output: 'static',
|
| 27 |
+
integrations: [
|
| 28 |
+
mermaid({ theme: 'neutral', autoTheme: true }),
|
| 29 |
+
mdx(),
|
| 30 |
+
svelte(),
|
| 31 |
+
// Precompress output with Gzip only (Brotli disabled due to server module mismatch)
|
| 32 |
+
compressor({ brotli: false, gzip: true })
|
| 33 |
+
],
|
| 34 |
+
devToolbar: {
|
| 35 |
+
enabled: false
|
| 36 |
+
},
|
| 37 |
+
markdown: {
|
| 38 |
+
shikiConfig: {
|
| 39 |
+
themes: {
|
| 40 |
+
light: 'github-light',
|
| 41 |
+
dark: 'github-dark'
|
| 42 |
+
},
|
| 43 |
+
defaultColor: false,
|
| 44 |
+
wrap: false,
|
| 45 |
+
langAlias: {
|
| 46 |
+
// Map MDX fences to TSX for better JSX tokenization
|
| 47 |
+
mdx: 'tsx'
|
| 48 |
+
}
|
| 49 |
+
},
|
| 50 |
+
remarkPlugins: [
|
| 51 |
+
remarkUnwrapCitationLinks,
|
| 52 |
+
remarkIgnoreCitationsInCode,
|
| 53 |
+
remarkMath,
|
| 54 |
+
[remarkFootnotes, { inlineNotes: true }],
|
| 55 |
+
remarkDirective,
|
| 56 |
+
remarkOutputContainer
|
| 57 |
+
],
|
| 58 |
+
rehypePlugins: [
|
| 59 |
+
rehypeSlug,
|
| 60 |
+
[rehypeAutolinkHeadings, { behavior: 'wrap' }],
|
| 61 |
+
[rehypeKatex, {
|
| 62 |
+
trust: true,
|
| 63 |
+
}],
|
| 64 |
+
[rehypeCitation, {
|
| 65 |
+
bibliography: 'src/content/bibliography.bib',
|
| 66 |
+
linkCitations: true,
|
| 67 |
+
csl: "apa",
|
| 68 |
+
noCite: false,
|
| 69 |
+
suppressBibliography: false,
|
| 70 |
+
}],
|
| 71 |
+
rehypeReferencesAndFootnotes,
|
| 72 |
+
rehypeRestoreAtInCode,
|
| 73 |
+
rehypeCodeCopy,
|
| 74 |
+
rehypeWrapOutput,
|
| 75 |
+
rehypeWrapTables
|
| 76 |
+
]
|
| 77 |
+
}
|
| 78 |
+
});
|
| 79 |
+
|
| 80 |
+
|
app/package-lock.json
ADDED
|
Binary file (450 kB). View file
|
|
|
app/package.json
ADDED
|
Binary file (2.49 kB). View file
|
|
|
app/plugins/rehype/code-copy.mjs
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Minimal rehype plugin to wrap code blocks with a copy button
|
| 2 |
+
// Exported as a standalone module to keep astro.config.mjs lean
|
| 3 |
+
export default function rehypeCodeCopy() {
|
| 4 |
+
return (tree) => {
|
| 5 |
+
// Walk the tree; lightweight visitor to find <pre><code>
|
| 6 |
+
const visit = (node, parent) => {
|
| 7 |
+
if (!node || typeof node !== 'object') return;
|
| 8 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 9 |
+
if (node.tagName === 'pre' && children.some(c => c.tagName === 'code')) {
|
| 10 |
+
// Find code child
|
| 11 |
+
const code = children.find(c => c.tagName === 'code');
|
| 12 |
+
// Determine if single-line block: prefer Shiki lines, then text content
|
| 13 |
+
const countLinesFromShiki = () => {
|
| 14 |
+
const isLineEl = (el) => el && el.type === 'element' && el.tagName === 'span' && Array.isArray(el.properties?.className) && el.properties.className.includes('line');
|
| 15 |
+
const hasNonWhitespaceText = (node) => {
|
| 16 |
+
if (!node) return false;
|
| 17 |
+
if (node.type === 'text') return /\S/.test(String(node.value || ''));
|
| 18 |
+
const kids = Array.isArray(node.children) ? node.children : [];
|
| 19 |
+
return kids.some(hasNonWhitespaceText);
|
| 20 |
+
};
|
| 21 |
+
const collectLines = (node, acc) => {
|
| 22 |
+
if (!node || typeof node !== 'object') return;
|
| 23 |
+
if (isLineEl(node)) acc.push(node);
|
| 24 |
+
const kids = Array.isArray(node.children) ? node.children : [];
|
| 25 |
+
kids.forEach((k) => collectLines(k, acc));
|
| 26 |
+
};
|
| 27 |
+
const lines = [];
|
| 28 |
+
collectLines(code, lines);
|
| 29 |
+
const nonEmpty = lines.filter((ln) => hasNonWhitespaceText(ln)).length;
|
| 30 |
+
return nonEmpty || 0;
|
| 31 |
+
};
|
| 32 |
+
const countLinesFromText = () => {
|
| 33 |
+
// Parse raw text content of the <code> node including nested spans
|
| 34 |
+
const extractText = (node) => {
|
| 35 |
+
if (!node) return '';
|
| 36 |
+
if (node.type === 'text') return String(node.value || '');
|
| 37 |
+
const kids = Array.isArray(node.children) ? node.children : [];
|
| 38 |
+
return kids.map(extractText).join('');
|
| 39 |
+
};
|
| 40 |
+
const raw = extractText(code);
|
| 41 |
+
if (!raw || !/\S/.test(raw)) return 0;
|
| 42 |
+
return raw.split('\n').filter(line => /\S/.test(line)).length;
|
| 43 |
+
};
|
| 44 |
+
const lines = countLinesFromShiki() || countLinesFromText();
|
| 45 |
+
const isSingleLine = lines <= 1;
|
| 46 |
+
// Also treat code blocks shorter than a threshold as single-line (defensive)
|
| 47 |
+
if (!isSingleLine) {
|
| 48 |
+
const approxChars = (() => {
|
| 49 |
+
const extract = (n) => Array.isArray(n?.children) ? n.children.map(extract).join('') : (n?.type === 'text' ? String(n.value||'') : '');
|
| 50 |
+
return extract(code).length;
|
| 51 |
+
})();
|
| 52 |
+
if (approxChars < 6) {
|
| 53 |
+
node.__forceSingle = true;
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
// Replace <pre> with wrapper div.code-card containing button + pre
|
| 57 |
+
const wrapper = {
|
| 58 |
+
type: 'element',
|
| 59 |
+
tagName: 'div',
|
| 60 |
+
properties: { className: ['code-card'].concat((isSingleLine || node.__forceSingle) ? ['no-copy'] : []) },
|
| 61 |
+
children: (isSingleLine || node.__forceSingle) ? [ node ] : [
|
| 62 |
+
{
|
| 63 |
+
type: 'element',
|
| 64 |
+
tagName: 'button',
|
| 65 |
+
properties: { className: ['code-copy', 'button--ghost'], type: 'button', 'aria-label': 'Copy code' },
|
| 66 |
+
children: [
|
| 67 |
+
{
|
| 68 |
+
type: 'element',
|
| 69 |
+
tagName: 'svg',
|
| 70 |
+
properties: { viewBox: '0 0 24 24', 'aria-hidden': 'true', focusable: 'false' },
|
| 71 |
+
children: [
|
| 72 |
+
{ type: 'element', tagName: 'path', properties: { d: 'M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z' }, children: [] }
|
| 73 |
+
]
|
| 74 |
+
}
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
node
|
| 78 |
+
]
|
| 79 |
+
};
|
| 80 |
+
if (parent && Array.isArray(parent.children)) {
|
| 81 |
+
const idx = parent.children.indexOf(node);
|
| 82 |
+
if (idx !== -1) parent.children[idx] = wrapper;
|
| 83 |
+
}
|
| 84 |
+
return; // don't visit nested
|
| 85 |
+
}
|
| 86 |
+
children.forEach((c) => visit(c, node));
|
| 87 |
+
};
|
| 88 |
+
visit(tree, null);
|
| 89 |
+
};
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
|
app/plugins/rehype/post-citation.mjs
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// rehype plugin to post-process citations and footnotes at build-time
|
| 2 |
+
// - Normalizes the bibliography into <ol class="references"> with <li id="...">
|
| 3 |
+
// - Linkifies DOI/URL occurrences inside references
|
| 4 |
+
// - Appends back-reference links (↩ back: 1, 2, ...) from each reference to in-text citation anchors
|
| 5 |
+
// - Cleans up footnotes block (.footnotes)
|
| 6 |
+
|
| 7 |
+
export default function rehypeReferencesAndFootnotes() {
|
| 8 |
+
return (tree) => {
|
| 9 |
+
const isElement = (n) => n && typeof n === 'object' && n.type === 'element';
|
| 10 |
+
const getChildren = (n) => (Array.isArray(n?.children) ? n.children : []);
|
| 11 |
+
|
| 12 |
+
const walk = (node, parent, fn) => {
|
| 13 |
+
if (!node || typeof node !== 'object') return;
|
| 14 |
+
fn && fn(node, parent);
|
| 15 |
+
const kids = getChildren(node);
|
| 16 |
+
for (const child of kids) walk(child, node, fn);
|
| 17 |
+
};
|
| 18 |
+
|
| 19 |
+
const ensureArray = (v) => (Array.isArray(v) ? v : v != null ? [v] : []);
|
| 20 |
+
|
| 21 |
+
const hasClass = (el, name) => {
|
| 22 |
+
const cn = ensureArray(el?.properties?.className).map(String);
|
| 23 |
+
return cn.includes(name);
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
const setAttr = (el, key, val) => {
|
| 27 |
+
el.properties = el.properties || {};
|
| 28 |
+
if (val == null) delete el.properties[key];
|
| 29 |
+
else el.properties[key] = val;
|
| 30 |
+
};
|
| 31 |
+
|
| 32 |
+
const getAttr = (el, key) => (el?.properties ? el.properties[key] : undefined);
|
| 33 |
+
|
| 34 |
+
// Shared helpers for backlinks + backrefs block
|
| 35 |
+
const collectBacklinksForIdSet = (idSet, anchorPrefix) => {
|
| 36 |
+
const idToBacklinks = new Map();
|
| 37 |
+
const idToAnchorNodes = new Map();
|
| 38 |
+
if (!idSet || idSet.size === 0) return { idToBacklinks, idToAnchorNodes };
|
| 39 |
+
walk(tree, null, (node) => {
|
| 40 |
+
if (!isElement(node) || node.tagName !== 'a') return;
|
| 41 |
+
const href = String(getAttr(node, 'href') || '');
|
| 42 |
+
if (!href.startsWith('#')) return;
|
| 43 |
+
const id = href.slice(1);
|
| 44 |
+
if (!idSet.has(id)) return;
|
| 45 |
+
// Ensure a stable id
|
| 46 |
+
let anchorId = String(getAttr(node, 'id') || '');
|
| 47 |
+
if (!anchorId) {
|
| 48 |
+
const list = idToBacklinks.get(id) || [];
|
| 49 |
+
anchorId = `${anchorPrefix}-${id}-${list.length + 1}`;
|
| 50 |
+
setAttr(node, 'id', anchorId);
|
| 51 |
+
}
|
| 52 |
+
const list = idToBacklinks.get(id) || [];
|
| 53 |
+
list.push(anchorId);
|
| 54 |
+
idToBacklinks.set(id, list);
|
| 55 |
+
const nodes = idToAnchorNodes.get(id) || [];
|
| 56 |
+
nodes.push(node);
|
| 57 |
+
idToAnchorNodes.set(id, nodes);
|
| 58 |
+
});
|
| 59 |
+
return { idToBacklinks, idToAnchorNodes };
|
| 60 |
+
};
|
| 61 |
+
|
| 62 |
+
const createBackIcon = () => ({
|
| 63 |
+
type: 'element',
|
| 64 |
+
tagName: 'svg',
|
| 65 |
+
properties: {
|
| 66 |
+
className: ['back-icon'],
|
| 67 |
+
width: 12,
|
| 68 |
+
height: 12,
|
| 69 |
+
viewBox: '0 0 24 24',
|
| 70 |
+
fill: 'none',
|
| 71 |
+
stroke: 'currentColor',
|
| 72 |
+
'stroke-width': 2,
|
| 73 |
+
'stroke-linecap': 'round',
|
| 74 |
+
'stroke-linejoin': 'round',
|
| 75 |
+
'aria-hidden': 'true',
|
| 76 |
+
focusable: 'false'
|
| 77 |
+
},
|
| 78 |
+
children: [
|
| 79 |
+
{ type: 'element', tagName: 'line', properties: { x1: 12, y1: 19, x2: 12, y2: 5 }, children: [] },
|
| 80 |
+
{ type: 'element', tagName: 'polyline', properties: { points: '5 12 12 5 19 12' }, children: [] }
|
| 81 |
+
]
|
| 82 |
+
});
|
| 83 |
+
|
| 84 |
+
const appendBackrefsBlock = (listElement, idToBacklinks, ariaLabel) => {
|
| 85 |
+
if (!listElement || !idToBacklinks || idToBacklinks.size === 0) return;
|
| 86 |
+
for (const li of getChildren(listElement)) {
|
| 87 |
+
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 88 |
+
const id = String(getAttr(li, 'id') || '');
|
| 89 |
+
if (!id) continue;
|
| 90 |
+
const keys = idToBacklinks.get(id);
|
| 91 |
+
if (!keys || !keys.length) continue;
|
| 92 |
+
// Remove pre-existing .backrefs in this li to avoid duplicates
|
| 93 |
+
li.children = getChildren(li).filter((n) => !(isElement(n) && n.tagName === 'small' && hasClass(n, 'backrefs')));
|
| 94 |
+
const small = {
|
| 95 |
+
type: 'element',
|
| 96 |
+
tagName: 'small',
|
| 97 |
+
properties: { className: ['backrefs'] },
|
| 98 |
+
children: []
|
| 99 |
+
};
|
| 100 |
+
if (keys.length === 1) {
|
| 101 |
+
// Single backlink: just the icon wrapped in the anchor
|
| 102 |
+
const a = {
|
| 103 |
+
type: 'element',
|
| 104 |
+
tagName: 'a',
|
| 105 |
+
properties: { href: `#${keys[0]}`, 'aria-label': ariaLabel },
|
| 106 |
+
children: [createBackIcon()]
|
| 107 |
+
};
|
| 108 |
+
small.children.push(a);
|
| 109 |
+
} else {
|
| 110 |
+
// Multiple backlinks: icon + label + numbered links
|
| 111 |
+
small.children.push(createBackIcon());
|
| 112 |
+
small.children.push({ type: 'text', value: ' back: ' });
|
| 113 |
+
keys.forEach((backId, idx) => {
|
| 114 |
+
small.children.push({
|
| 115 |
+
type: 'element',
|
| 116 |
+
tagName: 'a',
|
| 117 |
+
properties: { href: `#${backId}`, 'aria-label': ariaLabel },
|
| 118 |
+
children: [{ type: 'text', value: String(idx + 1) }]
|
| 119 |
+
});
|
| 120 |
+
if (idx < keys.length - 1) small.children.push({ type: 'text', value: ', ' });
|
| 121 |
+
});
|
| 122 |
+
}
|
| 123 |
+
li.children.push(small);
|
| 124 |
+
}
|
| 125 |
+
};
|
| 126 |
+
// Remove default back-reference anchors generated by remark-footnotes inside a footnote item
|
| 127 |
+
const getTextContent = (el) => {
|
| 128 |
+
if (!el) return '';
|
| 129 |
+
const stack = [el];
|
| 130 |
+
let out = '';
|
| 131 |
+
while (stack.length) {
|
| 132 |
+
const cur = stack.pop();
|
| 133 |
+
if (!cur) continue;
|
| 134 |
+
if (cur.type === 'text') out += String(cur.value || '');
|
| 135 |
+
const kids = getChildren(cur);
|
| 136 |
+
for (let i = kids.length - 1; i >= 0; i--) stack.push(kids[i]);
|
| 137 |
+
}
|
| 138 |
+
return out;
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
const removeFootnoteBackrefAnchors = (el) => {
|
| 142 |
+
if (!isElement(el)) return;
|
| 143 |
+
const kids = getChildren(el);
|
| 144 |
+
for (let i = kids.length - 1; i >= 0; i--) {
|
| 145 |
+
const child = kids[i];
|
| 146 |
+
if (isElement(child)) {
|
| 147 |
+
if (
|
| 148 |
+
child.tagName === 'a' && (
|
| 149 |
+
getAttr(child, 'data-footnote-backref') != null ||
|
| 150 |
+
hasClass(child, 'footnote-backref') ||
|
| 151 |
+
String(getAttr(child, 'role') || '').toLowerCase() === 'doc-backlink' ||
|
| 152 |
+
String(getAttr(child, 'aria-label') || '').toLowerCase().includes('back to content') ||
|
| 153 |
+
String(getAttr(child, 'href') || '').startsWith('#fnref') ||
|
| 154 |
+
// Fallback: text-based detection like "↩" or "↩2"
|
| 155 |
+
/^\s*↩\s*\d*\s*$/u.test(getTextContent(child))
|
| 156 |
+
)
|
| 157 |
+
) {
|
| 158 |
+
// Remove the anchor
|
| 159 |
+
el.children.splice(i, 1);
|
| 160 |
+
continue;
|
| 161 |
+
}
|
| 162 |
+
// Recurse into element
|
| 163 |
+
removeFootnoteBackrefAnchors(child);
|
| 164 |
+
// If a wrapper like <sup> or <span> became empty, remove it
|
| 165 |
+
const becameKids = getChildren(child);
|
| 166 |
+
if ((child.tagName === 'sup' || child.tagName === 'span') && (!becameKids || becameKids.length === 0)) {
|
| 167 |
+
el.children.splice(i, 1);
|
| 168 |
+
}
|
| 169 |
+
}
|
| 170 |
+
}
|
| 171 |
+
};
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
const normDoiHref = (href) => {
|
| 175 |
+
if (!href) return href;
|
| 176 |
+
const DUP = /https?:\/\/(?:dx\.)?doi\.org\/(?:https?:\/\/(?:dx\.)?doi\.org\/)+/gi;
|
| 177 |
+
const ONE = /https?:\/\/(?:dx\.)?doi\.org\/(10\.[^\s<>"']+)/i;
|
| 178 |
+
href = String(href).replace(DUP, 'https://doi.org/');
|
| 179 |
+
const m = href.match(ONE);
|
| 180 |
+
return m ? `https://doi.org/${m[1]}` : href;
|
| 181 |
+
};
|
| 182 |
+
|
| 183 |
+
const DOI_BARE = /\b10\.[0-9]{4,9}\/[\-._;()\/:A-Z0-9]+\b/gi;
|
| 184 |
+
const URL_GEN = /\bhttps?:\/\/[^\s<>()"']+/gi;
|
| 185 |
+
|
| 186 |
+
const linkifyTextNode = (textNode) => {
|
| 187 |
+
const text = String(textNode.value || '');
|
| 188 |
+
let last = 0;
|
| 189 |
+
const parts = [];
|
| 190 |
+
const pushText = (s) => { if (s) parts.push({ type: 'text', value: s }); };
|
| 191 |
+
|
| 192 |
+
const matches = [];
|
| 193 |
+
// Collect URL matches
|
| 194 |
+
let m;
|
| 195 |
+
URL_GEN.lastIndex = 0;
|
| 196 |
+
while ((m = URL_GEN.exec(text)) !== null) {
|
| 197 |
+
matches.push({ type: 'url', start: m.index, end: URL_GEN.lastIndex, raw: m[0] });
|
| 198 |
+
}
|
| 199 |
+
// Collect DOI matches
|
| 200 |
+
DOI_BARE.lastIndex = 0;
|
| 201 |
+
while ((m = DOI_BARE.exec(text)) !== null) {
|
| 202 |
+
matches.push({ type: 'doi', start: m.index, end: DOI_BARE.lastIndex, raw: m[0] });
|
| 203 |
+
}
|
| 204 |
+
matches.sort((a, b) => a.start - b.start);
|
| 205 |
+
|
| 206 |
+
for (const match of matches) {
|
| 207 |
+
if (match.start < last) continue; // overlapping
|
| 208 |
+
pushText(text.slice(last, match.start));
|
| 209 |
+
if (match.type === 'url') {
|
| 210 |
+
const href = normDoiHref(match.raw);
|
| 211 |
+
const doiOne = href.match(/https?:\/\/(?:dx\.)?doi\.org\/(10\.[^\s<>"']+)/i);
|
| 212 |
+
const a = {
|
| 213 |
+
type: 'element',
|
| 214 |
+
tagName: 'a',
|
| 215 |
+
properties: { href, target: '_blank', rel: 'noopener noreferrer' },
|
| 216 |
+
children: [{ type: 'text', value: doiOne ? doiOne[1] : href }]
|
| 217 |
+
};
|
| 218 |
+
parts.push(a);
|
| 219 |
+
} else {
|
| 220 |
+
const href = `https://doi.org/${match.raw}`;
|
| 221 |
+
const a = {
|
| 222 |
+
type: 'element',
|
| 223 |
+
tagName: 'a',
|
| 224 |
+
properties: { href, target: '_blank', rel: 'noopener noreferrer' },
|
| 225 |
+
children: [{ type: 'text', value: match.raw }]
|
| 226 |
+
};
|
| 227 |
+
parts.push(a);
|
| 228 |
+
}
|
| 229 |
+
last = match.end;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
pushText(text.slice(last));
|
| 233 |
+
return parts;
|
| 234 |
+
};
|
| 235 |
+
|
| 236 |
+
const linkifyInElement = (el) => {
|
| 237 |
+
const kids = getChildren(el);
|
| 238 |
+
for (let i = 0; i < kids.length; i++) {
|
| 239 |
+
const child = kids[i];
|
| 240 |
+
if (!child) continue;
|
| 241 |
+
if (child.type === 'text') {
|
| 242 |
+
const replacement = linkifyTextNode(child);
|
| 243 |
+
if (replacement.length === 1 && replacement[0].type === 'text') continue;
|
| 244 |
+
// Replace the single text node with multiple nodes
|
| 245 |
+
el.children.splice(i, 1, ...replacement);
|
| 246 |
+
i += replacement.length - 1;
|
| 247 |
+
} else if (isElement(child)) {
|
| 248 |
+
if (child.tagName === 'a') {
|
| 249 |
+
const href = normDoiHref(getAttr(child, 'href'));
|
| 250 |
+
setAttr(child, 'href', href);
|
| 251 |
+
const m = String(href || '').match(/https?:\/\/(?:dx\.)?doi\.org\/(10\.[^\s<>"']+)/i);
|
| 252 |
+
if (m && (!child.children || child.children.length === 0)) {
|
| 253 |
+
child.children = [{ type: 'text', value: m[1] }];
|
| 254 |
+
}
|
| 255 |
+
continue;
|
| 256 |
+
}
|
| 257 |
+
linkifyInElement(child);
|
| 258 |
+
}
|
| 259 |
+
}
|
| 260 |
+
// Deduplicate adjacent identical anchors
|
| 261 |
+
for (let i = 1; i < el.children.length; i++) {
|
| 262 |
+
const prev = el.children[i - 1];
|
| 263 |
+
const curr = el.children[i];
|
| 264 |
+
if (isElement(prev) && isElement(curr) && prev.tagName === 'a' && curr.tagName === 'a') {
|
| 265 |
+
const key = `${getAttr(prev, 'href') || ''}|${(prev.children?.[0]?.value) || ''}`;
|
| 266 |
+
const key2 = `${getAttr(curr, 'href') || ''}|${(curr.children?.[0]?.value) || ''}`;
|
| 267 |
+
if (key === key2) {
|
| 268 |
+
el.children.splice(i, 1);
|
| 269 |
+
i--;
|
| 270 |
+
}
|
| 271 |
+
}
|
| 272 |
+
}
|
| 273 |
+
};
|
| 274 |
+
|
| 275 |
+
// Find references container and normalize its list
|
| 276 |
+
const findReferencesRoot = () => {
|
| 277 |
+
let found = null;
|
| 278 |
+
walk(tree, null, (node) => {
|
| 279 |
+
if (found) return;
|
| 280 |
+
if (!isElement(node)) return;
|
| 281 |
+
|
| 282 |
+
// Ignore headers (h1, h2, h3, h4, h5, h6) - we only want container elements
|
| 283 |
+
if (/^h[1-6]$/i.test(node.tagName)) return;
|
| 284 |
+
|
| 285 |
+
const id = getAttr(node, 'id');
|
| 286 |
+
if (id === 'references' || id === 'refs' || hasClass(node, 'references') || hasClass(node, 'bibliography')) {
|
| 287 |
+
found = node;
|
| 288 |
+
}
|
| 289 |
+
});
|
| 290 |
+
return found;
|
| 291 |
+
};
|
| 292 |
+
|
| 293 |
+
const toOrderedList = (container) => {
|
| 294 |
+
// If there is already an <ol>, use it; otherwise convert common structures
|
| 295 |
+
let ol = getChildren(container).find((c) => isElement(c) && c.tagName === 'ol');
|
| 296 |
+
if (!ol) {
|
| 297 |
+
ol = { type: 'element', tagName: 'ol', properties: { className: ['references'] }, children: [] };
|
| 298 |
+
const candidates = getChildren(container).filter((n) => isElement(n));
|
| 299 |
+
if (candidates.length) {
|
| 300 |
+
for (const node of candidates) {
|
| 301 |
+
if (hasClass(node, 'csl-entry') || node.tagName === 'li' || node.tagName === 'p' || node.tagName === 'div') {
|
| 302 |
+
const li = { type: 'element', tagName: 'li', properties: {}, children: getChildren(node) };
|
| 303 |
+
if (getAttr(node, 'id')) setAttr(li, 'id', getAttr(node, 'id'));
|
| 304 |
+
ol.children.push(li);
|
| 305 |
+
}
|
| 306 |
+
}
|
| 307 |
+
}
|
| 308 |
+
// Replace container children by the new ol
|
| 309 |
+
container.children = [ol];
|
| 310 |
+
}
|
| 311 |
+
if (!hasClass(ol, 'references')) {
|
| 312 |
+
const cls = ensureArray(ol.properties?.className).map(String);
|
| 313 |
+
if (!cls.includes('references')) cls.push('references');
|
| 314 |
+
ol.properties = ol.properties || {};
|
| 315 |
+
ol.properties.className = cls;
|
| 316 |
+
}
|
| 317 |
+
return ol;
|
| 318 |
+
};
|
| 319 |
+
|
| 320 |
+
const refsRoot = findReferencesRoot();
|
| 321 |
+
let refsOl = null;
|
| 322 |
+
const refIdSet = new Set();
|
| 323 |
+
const refIdToExternalHref = new Map();
|
| 324 |
+
|
| 325 |
+
if (refsRoot) {
|
| 326 |
+
// Add a unique id to avoid collisions with user-created headers
|
| 327 |
+
setAttr(refsRoot, 'id', 'bibliography-references-list');
|
| 328 |
+
setAttr(refsRoot, 'data-bibliography-block', 'true');
|
| 329 |
+
|
| 330 |
+
refsOl = toOrderedList(refsRoot);
|
| 331 |
+
// Collect item ids and linkify their content
|
| 332 |
+
for (const li of getChildren(refsOl)) {
|
| 333 |
+
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 334 |
+
if (!getAttr(li, 'id')) {
|
| 335 |
+
// Try to find a nested element with id to promote
|
| 336 |
+
const nestedWithId = getChildren(li).find((n) => isElement(n) && getAttr(n, 'id'));
|
| 337 |
+
if (nestedWithId) setAttr(li, 'id', getAttr(nestedWithId, 'id'));
|
| 338 |
+
}
|
| 339 |
+
const id = getAttr(li, 'id');
|
| 340 |
+
if (id) refIdSet.add(String(id));
|
| 341 |
+
linkifyInElement(li);
|
| 342 |
+
// Record first external link href (e.g., DOI/URL) if present
|
| 343 |
+
if (id) {
|
| 344 |
+
let externalHref = null;
|
| 345 |
+
const stack = [li];
|
| 346 |
+
while (stack.length) {
|
| 347 |
+
const cur = stack.pop();
|
| 348 |
+
const kids = getChildren(cur);
|
| 349 |
+
for (const k of kids) {
|
| 350 |
+
if (isElement(k) && k.tagName === 'a') {
|
| 351 |
+
const href = String(getAttr(k, 'href') || '');
|
| 352 |
+
if (/^https?:\/\//i.test(href)) {
|
| 353 |
+
externalHref = href;
|
| 354 |
+
break;
|
| 355 |
+
}
|
| 356 |
+
}
|
| 357 |
+
if (isElement(k)) stack.push(k);
|
| 358 |
+
}
|
| 359 |
+
if (externalHref) break;
|
| 360 |
+
}
|
| 361 |
+
if (externalHref) refIdToExternalHref.set(String(id), externalHref);
|
| 362 |
+
}
|
| 363 |
+
}
|
| 364 |
+
setAttr(refsRoot, 'data-built-refs', '1');
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
// Collect in-text anchors that point to references ids
|
| 368 |
+
const { idToBacklinks: refIdToBacklinks, idToAnchorNodes: refIdToCitationAnchors } = collectBacklinksForIdSet(refIdSet, 'refctx');
|
| 369 |
+
|
| 370 |
+
// Append backlinks into references list items
|
| 371 |
+
appendBackrefsBlock(refsOl, refIdToBacklinks, 'Back to citation');
|
| 372 |
+
|
| 373 |
+
// Rewrite in-text citation anchors to external link when available
|
| 374 |
+
if (refIdToCitationAnchors.size > 0) {
|
| 375 |
+
for (const [id, anchors] of refIdToCitationAnchors.entries()) {
|
| 376 |
+
const ext = refIdToExternalHref.get(id);
|
| 377 |
+
if (!ext) continue;
|
| 378 |
+
for (const a of anchors) {
|
| 379 |
+
setAttr(a, 'data-ref-id', id);
|
| 380 |
+
setAttr(a, 'href', ext);
|
| 381 |
+
const existingTarget = getAttr(a, 'target');
|
| 382 |
+
if (!existingTarget) setAttr(a, 'target', '_blank');
|
| 383 |
+
const rel = String(getAttr(a, 'rel') || '');
|
| 384 |
+
const relSet = new Set(rel ? rel.split(/\s+/) : []);
|
| 385 |
+
relSet.add('noopener');
|
| 386 |
+
relSet.add('noreferrer');
|
| 387 |
+
setAttr(a, 'rel', Array.from(relSet).join(' '));
|
| 388 |
+
}
|
| 389 |
+
}
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
// Footnotes cleanup + backrefs harmonized with references
|
| 393 |
+
const cleanupFootnotes = () => {
|
| 394 |
+
let root = null;
|
| 395 |
+
walk(tree, null, (node) => {
|
| 396 |
+
if (!isElement(node)) return;
|
| 397 |
+
if (hasClass(node, 'footnotes')) root = node;
|
| 398 |
+
});
|
| 399 |
+
if (!root) return { root: null, ol: null, idSet: new Set() };
|
| 400 |
+
// Remove <hr> direct children
|
| 401 |
+
root.children = getChildren(root).filter((n) => !(isElement(n) && n.tagName === 'hr'));
|
| 402 |
+
// Ensure an <ol>
|
| 403 |
+
let ol = getChildren(root).find((c) => isElement(c) && c.tagName === 'ol');
|
| 404 |
+
if (!ol) {
|
| 405 |
+
ol = { type: 'element', tagName: 'ol', properties: {}, children: [] };
|
| 406 |
+
const items = getChildren(root).filter((n) => isElement(n) && (n.tagName === 'li' || hasClass(n, 'footnote') || n.tagName === 'p' || n.tagName === 'div'));
|
| 407 |
+
if (items.length) {
|
| 408 |
+
for (const it of items) {
|
| 409 |
+
const li = { type: 'element', tagName: 'li', properties: {}, children: getChildren(it) };
|
| 410 |
+
// Promote nested id if present (e.g., <p id="fn-1">)
|
| 411 |
+
const nestedWithId = getChildren(it).find((n) => isElement(n) && getAttr(n, 'id'));
|
| 412 |
+
if (nestedWithId) setAttr(li, 'id', getAttr(nestedWithId, 'id'));
|
| 413 |
+
ol.children.push(li);
|
| 414 |
+
}
|
| 415 |
+
}
|
| 416 |
+
root.children = [ol];
|
| 417 |
+
}
|
| 418 |
+
// For existing structures, try to promote ids from children when missing
|
| 419 |
+
for (const li of getChildren(ol)) {
|
| 420 |
+
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 421 |
+
if (!getAttr(li, 'id')) {
|
| 422 |
+
const nestedWithId = getChildren(li).find((n) => isElement(n) && getAttr(n, 'id'));
|
| 423 |
+
if (nestedWithId) setAttr(li, 'id', getAttr(nestedWithId, 'id'));
|
| 424 |
+
}
|
| 425 |
+
// Remove default footnote backrefs anywhere inside (to avoid duplication)
|
| 426 |
+
removeFootnoteBackrefAnchors(li);
|
| 427 |
+
}
|
| 428 |
+
setAttr(root, 'data-built-footnotes', '1');
|
| 429 |
+
// Collect id set
|
| 430 |
+
const idSet = new Set();
|
| 431 |
+
for (const li of getChildren(ol)) {
|
| 432 |
+
if (!isElement(li) || li.tagName !== 'li') continue;
|
| 433 |
+
const id = getAttr(li, 'id');
|
| 434 |
+
if (id) idSet.add(String(id));
|
| 435 |
+
}
|
| 436 |
+
return { root, ol, idSet };
|
| 437 |
+
};
|
| 438 |
+
|
| 439 |
+
const { root: footRoot, ol: footOl, idSet: footIdSet } = cleanupFootnotes();
|
| 440 |
+
|
| 441 |
+
// Collect in-text anchors pointing to footnotes
|
| 442 |
+
const { idToBacklinks: footIdToBacklinks } = collectBacklinksForIdSet(footIdSet, 'footctx');
|
| 443 |
+
|
| 444 |
+
// Append backlinks into footnote list items (identical pattern to references)
|
| 445 |
+
appendBackrefsBlock(footOl, footIdToBacklinks, 'Back to footnote call');
|
| 446 |
+
};
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
|
app/plugins/rehype/restore-at-in-code.mjs
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Rehype plugin to restore '@' inside code nodes after rehype-citation ran
|
| 2 |
+
export default function rehypeRestoreAtInCode() {
|
| 3 |
+
return (tree) => {
|
| 4 |
+
const restoreInNode = (node) => {
|
| 5 |
+
if (!node || typeof node !== 'object') return;
|
| 6 |
+
const isText = node.type === 'text';
|
| 7 |
+
if (isText && typeof node.value === 'string' && node.value.includes('__AT_SENTINEL__')) {
|
| 8 |
+
node.value = node.value.replace(/__AT_SENTINEL__/g, '@');
|
| 9 |
+
}
|
| 10 |
+
const isCodeEl = node.type === 'element' && node.tagName === 'code';
|
| 11 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 12 |
+
if (isCodeEl && children.length) {
|
| 13 |
+
children.forEach(restoreInNode);
|
| 14 |
+
return;
|
| 15 |
+
}
|
| 16 |
+
children.forEach(restoreInNode);
|
| 17 |
+
};
|
| 18 |
+
restoreInNode(tree);
|
| 19 |
+
};
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
|
app/plugins/rehype/wrap-outputs.mjs
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Wrap plain-text content inside <section class="code-outputs"> into a <pre>
|
| 2 |
+
export default function rehypeWrapOutput() {
|
| 3 |
+
return (tree) => {
|
| 4 |
+
const isWhitespace = (value) => typeof value === 'string' && !/\S/.test(value);
|
| 5 |
+
const extractText = (node) => {
|
| 6 |
+
if (!node) return '';
|
| 7 |
+
if (node.type === 'text') return String(node.value || '');
|
| 8 |
+
const kids = Array.isArray(node.children) ? node.children : [];
|
| 9 |
+
return kids.map(extractText).join('');
|
| 10 |
+
};
|
| 11 |
+
const visit = (node) => {
|
| 12 |
+
if (!node || typeof node !== 'object') return;
|
| 13 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 14 |
+
if (node.type === 'element' && node.tagName === 'section') {
|
| 15 |
+
const className = node.properties?.className || [];
|
| 16 |
+
const classes = Array.isArray(className) ? className : [className].filter(Boolean);
|
| 17 |
+
if (classes.includes('code-output')) {
|
| 18 |
+
const meaningful = children.filter((c) => !(c.type === 'text' && isWhitespace(c.value)));
|
| 19 |
+
if (meaningful.length === 1) {
|
| 20 |
+
const only = meaningful[0];
|
| 21 |
+
const isPlainParagraph = only.type === 'element' && only.tagName === 'p' && (only.children || []).every((c) => c.type === 'text');
|
| 22 |
+
const isPlainText = only.type === 'text';
|
| 23 |
+
if (isPlainParagraph || isPlainText) {
|
| 24 |
+
const text = isPlainText ? String(only.value || '') : extractText(only);
|
| 25 |
+
node.children = [
|
| 26 |
+
{ type: 'element', tagName: 'pre', properties: {}, children: [ { type: 'text', value: text } ] }
|
| 27 |
+
];
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
children.forEach(visit);
|
| 33 |
+
};
|
| 34 |
+
visit(tree);
|
| 35 |
+
};
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
|
app/plugins/rehype/wrap-tables.mjs
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// rehype plugin: wrap bare <table> elements in a <div class="table-scroll"> container
|
| 2 |
+
// so that tables stay width:100% while enabling horizontal scroll when content overflows
|
| 3 |
+
|
| 4 |
+
export default function rehypeWrapTables() {
|
| 5 |
+
return (tree) => {
|
| 6 |
+
const isElement = (n) => n && typeof n === 'object' && n.type === 'element';
|
| 7 |
+
const getChildren = (n) => (Array.isArray(n?.children) ? n.children : []);
|
| 8 |
+
|
| 9 |
+
const walk = (node, parent, fn) => {
|
| 10 |
+
if (!node || typeof node !== 'object') return;
|
| 11 |
+
fn && fn(node, parent);
|
| 12 |
+
const kids = getChildren(node);
|
| 13 |
+
for (const child of kids) walk(child, node, fn);
|
| 14 |
+
};
|
| 15 |
+
|
| 16 |
+
const ensureArray = (v) => (Array.isArray(v) ? v : v != null ? [v] : []);
|
| 17 |
+
const hasClass = (el, name) => ensureArray(el?.properties?.className).map(String).includes(name);
|
| 18 |
+
|
| 19 |
+
const wrapTable = (tableNode, parent) => {
|
| 20 |
+
if (!parent || !Array.isArray(parent.children)) return;
|
| 21 |
+
// Don't double-wrap if already inside .table-scroll
|
| 22 |
+
if (parent.tagName === 'div' && hasClass(parent, 'table-scroll')) return;
|
| 23 |
+
|
| 24 |
+
const wrapper = {
|
| 25 |
+
type: 'element',
|
| 26 |
+
tagName: 'div',
|
| 27 |
+
properties: { className: ['table-scroll'] },
|
| 28 |
+
children: [tableNode]
|
| 29 |
+
};
|
| 30 |
+
|
| 31 |
+
const idx = parent.children.indexOf(tableNode);
|
| 32 |
+
if (idx >= 0) parent.children.splice(idx, 1, wrapper);
|
| 33 |
+
};
|
| 34 |
+
|
| 35 |
+
walk(tree, null, (node, parent) => {
|
| 36 |
+
if (!isElement(node)) return;
|
| 37 |
+
if (node.tagName !== 'table') return;
|
| 38 |
+
wrapTable(node, parent);
|
| 39 |
+
});
|
| 40 |
+
};
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
app/plugins/remark/ignore-citations-in-code.mjs
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Remark plugin to ignore citations inside code (block and inline)
|
| 2 |
+
export default function remarkIgnoreCitationsInCode() {
|
| 3 |
+
return (tree) => {
|
| 4 |
+
const visit = (node) => {
|
| 5 |
+
if (!node || typeof node !== 'object') return;
|
| 6 |
+
const type = node.type;
|
| 7 |
+
if (type === 'code' || type === 'inlineCode') {
|
| 8 |
+
if (typeof node.value === 'string' && node.value.includes('@')) {
|
| 9 |
+
// Use a sentinel to avoid rehype-citation, will be restored later in rehype
|
| 10 |
+
node.value = node.value.replace(/@/g, '__AT_SENTINEL__');
|
| 11 |
+
}
|
| 12 |
+
return; // do not traverse into code
|
| 13 |
+
}
|
| 14 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 15 |
+
children.forEach(visit);
|
| 16 |
+
};
|
| 17 |
+
visit(tree);
|
| 18 |
+
};
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
|
app/plugins/remark/output-container.mjs
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Transform `:::output ... :::` into a <section class="code-output"> wrapper
|
| 2 |
+
// Requires remark-directive to be applied before this plugin
|
| 3 |
+
|
| 4 |
+
export default function remarkOutputContainer() {
|
| 5 |
+
return (tree) => {
|
| 6 |
+
const visit = (node) => {
|
| 7 |
+
if (!node || typeof node !== 'object') return;
|
| 8 |
+
|
| 9 |
+
if (node.type === 'containerDirective' && node.name === 'output') {
|
| 10 |
+
node.data = node.data || {};
|
| 11 |
+
node.data.hName = 'section';
|
| 12 |
+
node.data.hProperties = { className: ['code-output'] };
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 16 |
+
for (const child of children) visit(child);
|
| 17 |
+
};
|
| 18 |
+
|
| 19 |
+
visit(tree);
|
| 20 |
+
};
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
|
app/plugins/remark/outputs-container.mjs
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Transform `:::outputs ... :::` into a <section class="code-outputs"> wrapper
|
| 2 |
+
// Requires remark-directive to be applied before this plugin
|
| 3 |
+
|
| 4 |
+
export default function remarkOutputsContainer() {
|
| 5 |
+
return (tree) => {
|
| 6 |
+
const visit = (node) => {
|
| 7 |
+
if (!node || typeof node !== 'object') return;
|
| 8 |
+
|
| 9 |
+
if (node.type === 'containerDirective' && node.name === 'outputs') {
|
| 10 |
+
node.data = node.data || {};
|
| 11 |
+
node.data.hName = 'section';
|
| 12 |
+
node.data.hProperties = { className: ['code-outputs'] };
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 16 |
+
for (const child of children) visit(child);
|
| 17 |
+
};
|
| 18 |
+
|
| 19 |
+
visit(tree);
|
| 20 |
+
};
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
|
app/plugins/remark/unwrap-citation-links.mjs
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Plugin remark pour transformer les liens markdown contenant des citations en citations simples
|
| 2 |
+
// Transforme [@reference](url) en [@reference]
|
| 3 |
+
export default function remarkUnwrapCitationLinks() {
|
| 4 |
+
return (tree) => {
|
| 5 |
+
// Fonction helper pour extraire le contenu textuel d'un nœud
|
| 6 |
+
const getTextContent = (node) => {
|
| 7 |
+
if (!node) return '';
|
| 8 |
+
if (node.type === 'text') return node.value || '';
|
| 9 |
+
if (Array.isArray(node.children)) {
|
| 10 |
+
return node.children.map(getTextContent).join('');
|
| 11 |
+
}
|
| 12 |
+
return '';
|
| 13 |
+
};
|
| 14 |
+
|
| 15 |
+
const visit = (node, parent) => {
|
| 16 |
+
if (!node || typeof node !== 'object') return;
|
| 17 |
+
|
| 18 |
+
// Parcourir les enfants d'abord (post-order traversal)
|
| 19 |
+
const children = Array.isArray(node.children) ? node.children : [];
|
| 20 |
+
for (let i = 0; i < children.length; i++) {
|
| 21 |
+
const child = children[i];
|
| 22 |
+
visit(child, node);
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
// Si c'est un nœud de type 'link', vérifier son contenu
|
| 26 |
+
if (node.type === 'link' && parent && Array.isArray(parent.children)) {
|
| 27 |
+
// Récupérer le contenu textuel du lien
|
| 28 |
+
const textContent = getTextContent(node);
|
| 29 |
+
|
| 30 |
+
// Debug
|
| 31 |
+
console.log('🔍 Link trouvé:', {
|
| 32 |
+
text: textContent,
|
| 33 |
+
url: node.url,
|
| 34 |
+
matches: /^@\w+/.test(textContent.trim())
|
| 35 |
+
});
|
| 36 |
+
|
| 37 |
+
// Vérifier si c'est une citation (commence par @)
|
| 38 |
+
if (textContent && /^@\w+/.test(textContent.trim())) {
|
| 39 |
+
// Trouver l'index du nœud dans son parent
|
| 40 |
+
const index = parent.children.indexOf(node);
|
| 41 |
+
|
| 42 |
+
if (index !== -1) {
|
| 43 |
+
console.log('✅ Transformation:', textContent);
|
| 44 |
+
// Remplacer le nœud link par un nœud text simple
|
| 45 |
+
parent.children[index] = {
|
| 46 |
+
type: 'text',
|
| 47 |
+
value: textContent.trim()
|
| 48 |
+
};
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
};
|
| 53 |
+
|
| 54 |
+
visit(tree, null);
|
| 55 |
+
};
|
| 56 |
+
}
|
| 57 |
+
|
app/postcss.config.mjs
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// PostCSS config enabling Custom Media Queries
|
| 2 |
+
// Allows usage of: @media (--bp-content-collapse) { ... }
|
| 3 |
+
|
| 4 |
+
import postcssCustomMedia from 'postcss-custom-media';
|
| 5 |
+
import postcssPresetEnv from 'postcss-preset-env';
|
| 6 |
+
|
| 7 |
+
export default {
|
| 8 |
+
plugins: [
|
| 9 |
+
postcssCustomMedia(),
|
| 10 |
+
postcssPresetEnv({
|
| 11 |
+
stage: 0
|
| 12 |
+
})
|
| 13 |
+
]
|
| 14 |
+
};
|
app/public/data
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
../src/content/assets/data
|
app/public/hf-space-parent-listener.js
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Script pour la fenêtre parente des Spaces Hugging Face
|
| 3 |
+
* Ce script écoute les messages de l'iframe et met à jour l'URL de la fenêtre parente
|
| 4 |
+
*
|
| 5 |
+
* Instructions d'utilisation :
|
| 6 |
+
* 1. Ajoutez ce script à votre Space Hugging Face dans le fichier app.py ou dans un composant Gradio
|
| 7 |
+
* 2. Ou utilisez-le dans une page HTML qui contient votre iframe
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
(function () {
|
| 11 |
+
'use strict';
|
| 12 |
+
|
| 13 |
+
console.log('HF Space Parent Listener initialized');
|
| 14 |
+
|
| 15 |
+
// Écouter les messages de l'iframe
|
| 16 |
+
window.addEventListener('message', function (event) {
|
| 17 |
+
console.log('Received message from iframe:', event.data);
|
| 18 |
+
|
| 19 |
+
// Vérifier le type de message
|
| 20 |
+
if (event.data && event.data.type) {
|
| 21 |
+
switch (event.data.type) {
|
| 22 |
+
case 'urlChange':
|
| 23 |
+
case 'anchorChange':
|
| 24 |
+
case 'HF_SPACE_URL_UPDATE':
|
| 25 |
+
handleUrlChange(event.data);
|
| 26 |
+
break;
|
| 27 |
+
default:
|
| 28 |
+
console.log('Unknown message type:', event.data.type);
|
| 29 |
+
}
|
| 30 |
+
}
|
| 31 |
+
});
|
| 32 |
+
|
| 33 |
+
function handleUrlChange(data) {
|
| 34 |
+
try {
|
| 35 |
+
const hash = data.hash || data.anchorId;
|
| 36 |
+
const url = data.url;
|
| 37 |
+
|
| 38 |
+
if (hash) {
|
| 39 |
+
// Mettre à jour l'URL avec la nouvelle ancre
|
| 40 |
+
const newUrl = new URL(window.location);
|
| 41 |
+
newUrl.hash = hash;
|
| 42 |
+
|
| 43 |
+
// Utiliser replaceState pour éviter d'ajouter une entrée dans l'historique
|
| 44 |
+
window.history.replaceState(null, '', newUrl.toString());
|
| 45 |
+
|
| 46 |
+
console.log('Updated parent URL to:', newUrl.toString());
|
| 47 |
+
|
| 48 |
+
// Optionnel : faire défiler vers l'élément correspondant dans la page parente
|
| 49 |
+
const targetElement = document.querySelector(hash);
|
| 50 |
+
if (targetElement) {
|
| 51 |
+
targetElement.scrollIntoView({ behavior: 'smooth', block: 'start' });
|
| 52 |
+
}
|
| 53 |
+
}
|
| 54 |
+
} catch (error) {
|
| 55 |
+
console.error('Error updating parent URL:', error);
|
| 56 |
+
}
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
// Fonction utilitaire pour tester la communication
|
| 60 |
+
window.testIframeCommunication = function () {
|
| 61 |
+
console.log('Testing iframe communication...');
|
| 62 |
+
const iframe = document.querySelector('iframe');
|
| 63 |
+
if (iframe) {
|
| 64 |
+
iframe.contentWindow.postMessage({ type: 'test' }, '*');
|
| 65 |
+
} else {
|
| 66 |
+
console.log('No iframe found');
|
| 67 |
+
}
|
| 68 |
+
};
|
| 69 |
+
|
| 70 |
+
})();
|
app/public/scripts/color-palettes.js
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Global color palettes generator and watcher
|
| 2 |
+
// - Observes CSS variable --primary-color and theme changes
|
| 3 |
+
// - Generates categorical, sequential, and diverging palettes (OKLCH/OKLab)
|
| 4 |
+
// - Exposes results as CSS variables on :root
|
| 5 |
+
// - Supports variable color counts per palette via CSS vars
|
| 6 |
+
// - Dispatches a 'palettes:updated' CustomEvent after each update
|
| 7 |
+
|
| 8 |
+
(() => {
|
| 9 |
+
const MODE = { cssRoot: document.documentElement };
|
| 10 |
+
|
| 11 |
+
const getCssVar = (name) => {
|
| 12 |
+
try { return getComputedStyle(MODE.cssRoot).getPropertyValue(name).trim(); } catch { return ''; }
|
| 13 |
+
};
|
| 14 |
+
const getIntFromCssVar = (name, fallback) => {
|
| 15 |
+
const raw = getCssVar(name);
|
| 16 |
+
if (!raw) return fallback;
|
| 17 |
+
const v = parseInt(String(raw), 10);
|
| 18 |
+
if (Number.isNaN(v)) return fallback;
|
| 19 |
+
return v;
|
| 20 |
+
};
|
| 21 |
+
const clamp = (n, min, max) => Math.max(min, Math.min(max, n));
|
| 22 |
+
|
| 23 |
+
// Color math (OKLab/OKLCH)
|
| 24 |
+
const srgbToLinear = (u) => (u <= 0.04045 ? u / 12.92 : Math.pow((u + 0.055) / 1.055, 2.4));
|
| 25 |
+
const linearToSrgb = (u) => (u <= 0.0031308 ? 12.92 * u : 1.055 * Math.pow(Math.max(0, u), 1 / 2.4) - 0.055);
|
| 26 |
+
const rgbToOklab = (r, g, b) => {
|
| 27 |
+
const rl = srgbToLinear(r), gl = srgbToLinear(g), bl = srgbToLinear(b);
|
| 28 |
+
const l = Math.cbrt(0.4122214708 * rl + 0.5363325363 * gl + 0.0514459929 * bl);
|
| 29 |
+
const m = Math.cbrt(0.2119034982 * rl + 0.6806995451 * gl + 0.1073969566 * bl);
|
| 30 |
+
const s = Math.cbrt(0.0883024619 * rl + 0.2817188376 * gl + 0.6299787005 * bl);
|
| 31 |
+
const L = 0.2104542553 * l + 0.7936177850 * m - 0.0040720468 * s;
|
| 32 |
+
const a = 1.9779984951 * l - 2.4285922050 * m + 0.4505937099 * s;
|
| 33 |
+
const b2 = 0.0259040371 * l + 0.7827717662 * m - 0.8086757660 * s;
|
| 34 |
+
return { L, a, b: b2 };
|
| 35 |
+
};
|
| 36 |
+
const oklabToRgb = (L, a, b) => {
|
| 37 |
+
const l_ = L + 0.3963377774 * a + 0.2158037573 * b;
|
| 38 |
+
const m_ = L - 0.1055613458 * a - 0.0638541728 * b;
|
| 39 |
+
const s_ = L - 0.0894841775 * a - 1.2914855480 * b;
|
| 40 |
+
const l = l_ * l_ * l_;
|
| 41 |
+
const m = m_ * m_ * m_;
|
| 42 |
+
const s = s_ * s_ * s_;
|
| 43 |
+
const r = linearToSrgb(+4.0767416621 * l - 3.3077115913 * m + 0.2309699292 * s);
|
| 44 |
+
const g = linearToSrgb(-1.2684380046 * l + 2.6097574011 * m - 0.3413193965 * s);
|
| 45 |
+
const b3 = linearToSrgb(-0.0041960863 * l - 0.7034186147 * m + 1.7076147010 * s);
|
| 46 |
+
return { r, g, b: b3 };
|
| 47 |
+
};
|
| 48 |
+
const oklchToOklab = (L, C, hDeg) => { const h = (hDeg * Math.PI) / 180; return { L, a: C * Math.cos(h), b: C * Math.sin(h) }; };
|
| 49 |
+
const oklabToOklch = (L, a, b) => { const C = Math.sqrt(a * a + b * b); let h = Math.atan2(b, a) * 180 / Math.PI; if (h < 0) h += 360; return { L, C, h }; };
|
| 50 |
+
const clamp01 = (x) => Math.min(1, Math.max(0, x));
|
| 51 |
+
const isInGamut = ({ r, g, b }) => r >= 0 && r <= 1 && g >= 0 && g <= 1 && b >= 0 && b <= 1;
|
| 52 |
+
const toHex = ({ r, g, b }) => {
|
| 53 |
+
const R = Math.round(clamp01(r) * 255), G = Math.round(clamp01(g) * 255), B = Math.round(clamp01(b) * 255);
|
| 54 |
+
const h = (n) => n.toString(16).padStart(2, '0');
|
| 55 |
+
return `#${h(R)}${h(G)}${h(B)}`.toUpperCase();
|
| 56 |
+
};
|
| 57 |
+
const oklchToHexSafe = (L, C, h) => { let c = C; for (let i = 0; i < 12; i++) { const { a, b } = oklchToOklab(L, c, h); const rgb = oklabToRgb(L, a, b); if (isInGamut(rgb)) return toHex(rgb); c = Math.max(0, c - 0.02); } return toHex(oklabToRgb(L, 0, 0)); };
|
| 58 |
+
const parseCssColorToRgb = (css) => { try { const el = document.createElement('span'); el.style.color = css; document.body.appendChild(el); const cs = getComputedStyle(el).color; document.body.removeChild(el); const m = cs.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/i); if (!m) return null; return { r: Number(m[1]) / 255, g: Number(m[2]) / 255, b: Number(m[3]) / 255 }; } catch { return null; } };
|
| 59 |
+
|
| 60 |
+
// Get primary color in OKLCH format to preserve precision
|
| 61 |
+
const getPrimaryOKLCH = () => {
|
| 62 |
+
const css = getCssVar('--primary-color');
|
| 63 |
+
if (!css) return null;
|
| 64 |
+
|
| 65 |
+
// For OKLCH colors, return the exact values without conversion
|
| 66 |
+
if (css.includes('oklch')) {
|
| 67 |
+
const oklchMatch = css.match(/oklch\(([^)]+)\)/);
|
| 68 |
+
if (oklchMatch) {
|
| 69 |
+
const values = oklchMatch[1].split(/\s+/).map(v => parseFloat(v.trim()));
|
| 70 |
+
if (values.length >= 3) {
|
| 71 |
+
const [L, C, h] = values;
|
| 72 |
+
return { L, C, h };
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
// For non-OKLCH colors, convert to OKLCH for consistency
|
| 78 |
+
const rgb = parseCssColorToRgb(css);
|
| 79 |
+
if (rgb) {
|
| 80 |
+
const { L, a, b } = rgbToOklab(rgb.r, rgb.g, rgb.b);
|
| 81 |
+
const { C, h } = oklabToOklch(L, a, b);
|
| 82 |
+
return { L, C, h };
|
| 83 |
+
}
|
| 84 |
+
return null;
|
| 85 |
+
};
|
| 86 |
+
|
| 87 |
+
// Keep getPrimaryHex for backward compatibility, but now it converts from OKLCH
|
| 88 |
+
const getPrimaryHex = () => {
|
| 89 |
+
const oklch = getPrimaryOKLCH();
|
| 90 |
+
if (!oklch) return null;
|
| 91 |
+
|
| 92 |
+
const { a, b } = oklchToOklab(oklch.L, oklch.C, oklch.h);
|
| 93 |
+
const rgb = oklabToRgb(oklch.L, a, b);
|
| 94 |
+
return toHex(rgb);
|
| 95 |
+
};
|
| 96 |
+
// No count management via CSS anymore; counts are passed directly to the API
|
| 97 |
+
|
| 98 |
+
const generators = {
|
| 99 |
+
categorical: (baseOKLCH, count) => {
|
| 100 |
+
const { L, C, h } = baseOKLCH;
|
| 101 |
+
const L0 = Math.min(0.85, Math.max(0.4, L));
|
| 102 |
+
const C0 = Math.min(0.35, Math.max(0.1, C || 0.2));
|
| 103 |
+
const total = Math.max(1, Math.min(12, count || 8));
|
| 104 |
+
const hueStep = 360 / total;
|
| 105 |
+
const results = [];
|
| 106 |
+
for (let i = 0; i < total; i++) {
|
| 107 |
+
const hDeg = (h + i * hueStep) % 360;
|
| 108 |
+
const lVar = ((i % 3) - 1) * 0.04;
|
| 109 |
+
results.push(oklchToHexSafe(Math.max(0.4, Math.min(0.85, L0 + lVar)), C0, hDeg));
|
| 110 |
+
}
|
| 111 |
+
return results;
|
| 112 |
+
},
|
| 113 |
+
sequential: (baseOKLCH, count) => {
|
| 114 |
+
const { L, C, h } = baseOKLCH;
|
| 115 |
+
const total = Math.max(1, Math.min(12, count || 8));
|
| 116 |
+
const startL = Math.max(0.25, L - 0.18);
|
| 117 |
+
const endL = Math.min(0.92, L + 0.18);
|
| 118 |
+
const cBase = Math.min(0.33, Math.max(0.08, C * 0.9 + 0.06));
|
| 119 |
+
const out = [];
|
| 120 |
+
for (let i = 0; i < total; i++) {
|
| 121 |
+
const t = total === 1 ? 0 : i / (total - 1);
|
| 122 |
+
const lNow = startL * (1 - t) + endL * t;
|
| 123 |
+
const cNow = cBase * (0.85 + 0.15 * (1 - Math.abs(0.5 - t) * 2));
|
| 124 |
+
out.push(oklchToHexSafe(lNow, cNow, h));
|
| 125 |
+
}
|
| 126 |
+
return out;
|
| 127 |
+
},
|
| 128 |
+
diverging: (baseOKLCH, count) => {
|
| 129 |
+
const { L, C, h } = baseOKLCH;
|
| 130 |
+
const total = Math.max(1, Math.min(12, count || 8));
|
| 131 |
+
|
| 132 |
+
// Left endpoint: EXACT primary color (no darkening)
|
| 133 |
+
const leftLab = oklchToOklab(L, C, h);
|
| 134 |
+
// Right endpoint: complement with same L and similar C (clamped safe)
|
| 135 |
+
const compH = (h + 180) % 360;
|
| 136 |
+
const cSafe = Math.min(0.35, Math.max(0.08, C));
|
| 137 |
+
const rightLab = oklchToOklab(L, cSafe, compH);
|
| 138 |
+
const whiteLab = { L: 0.98, a: 0, b: 0 }; // center near‑white
|
| 139 |
+
|
| 140 |
+
const hexFromOKLab = (L, a, b) => toHex(oklabToRgb(L, a, b));
|
| 141 |
+
const lerp = (a, b, t) => a + (b - a) * t;
|
| 142 |
+
const lerpOKLabHex = (A, B, t) => hexFromOKLab(lerp(A.L, B.L, t), lerp(A.a, B.a, t), lerp(A.b, B.b, t));
|
| 143 |
+
|
| 144 |
+
const out = [];
|
| 145 |
+
if (total % 2 === 1) {
|
| 146 |
+
const nSide = (total - 1) >> 1; // items on each side
|
| 147 |
+
// Left side: include left endpoint exactly at index 0
|
| 148 |
+
for (let i = 0; i < nSide; i++) {
|
| 149 |
+
const t = nSide <= 1 ? 0 : (i / (nSide - 1)); // 0 .. 1
|
| 150 |
+
// Move from leftLab to a value close (but not equal) to white; ensure last before center is lighter
|
| 151 |
+
const tt = t * 0.9; // keep some distance from pure white before center
|
| 152 |
+
out.push(lerpOKLabHex(leftLab, whiteLab, tt));
|
| 153 |
+
}
|
| 154 |
+
// Center
|
| 155 |
+
out.push(hexFromOKLab(whiteLab.L, whiteLab.a, whiteLab.b));
|
| 156 |
+
// Right side: start near white and end EXACTLY at rightLab
|
| 157 |
+
for (let i = 0; i < nSide; i++) {
|
| 158 |
+
const t = nSide <= 1 ? 1 : ((i + 1) / nSide); // (1/n)..1
|
| 159 |
+
const tt = Math.max(0.1, t); // avoid starting at pure white
|
| 160 |
+
out.push(lerpOKLabHex(whiteLab, rightLab, tt));
|
| 161 |
+
}
|
| 162 |
+
// Ensure first and last are exact endpoints
|
| 163 |
+
if (out.length) { out[0] = hexFromOKLab(leftLab.L, leftLab.a, leftLab.b); out[out.length - 1] = hexFromOKLab(rightLab.L, rightLab.a, rightLab.b); }
|
| 164 |
+
} else {
|
| 165 |
+
const nSide = total >> 1;
|
| 166 |
+
// Left half including left endpoint, approaching white but not reaching it
|
| 167 |
+
for (let i = 0; i < nSide; i++) {
|
| 168 |
+
const t = nSide <= 1 ? 0 : (i / (nSide - 1)); // 0 .. 1
|
| 169 |
+
const tt = t * 0.9;
|
| 170 |
+
out.push(lerpOKLabHex(leftLab, whiteLab, tt));
|
| 171 |
+
}
|
| 172 |
+
// Right half: mirror from near white to exact right endpoint
|
| 173 |
+
for (let i = 0; i < nSide; i++) {
|
| 174 |
+
const t = nSide <= 1 ? 1 : ((i + 1) / nSide); // (1/n)..1
|
| 175 |
+
const tt = Math.max(0.1, t);
|
| 176 |
+
out.push(lerpOKLabHex(whiteLab, rightLab, tt));
|
| 177 |
+
}
|
| 178 |
+
if (out.length) { out[0] = hexFromOKLab(leftLab.L, leftLab.a, leftLab.b); out[out.length - 1] = hexFromOKLab(rightLab.L, rightLab.a, rightLab.b); }
|
| 179 |
+
}
|
| 180 |
+
return out;
|
| 181 |
+
}
|
| 182 |
+
};
|
| 183 |
+
|
| 184 |
+
let lastSignature = '';
|
| 185 |
+
|
| 186 |
+
const updatePalettes = () => {
|
| 187 |
+
const primaryOKLCH = getPrimaryOKLCH();
|
| 188 |
+
const primaryHex = getPrimaryHex();
|
| 189 |
+
const signature = `${primaryOKLCH?.L},${primaryOKLCH?.C},${primaryOKLCH?.h}`;
|
| 190 |
+
if (signature === lastSignature) return;
|
| 191 |
+
lastSignature = signature;
|
| 192 |
+
try { document.dispatchEvent(new CustomEvent('palettes:updated', { detail: { primary: primaryHex, primaryOKLCH } })); } catch { }
|
| 193 |
+
};
|
| 194 |
+
|
| 195 |
+
const bootstrap = () => {
|
| 196 |
+
// Initial setup - only run once on page load
|
| 197 |
+
updatePalettes();
|
| 198 |
+
|
| 199 |
+
// Observer will handle all subsequent changes
|
| 200 |
+
const mo = new MutationObserver(() => updatePalettes());
|
| 201 |
+
mo.observe(MODE.cssRoot, { attributes: true, attributeFilter: ['style', 'data-theme'] });
|
| 202 |
+
|
| 203 |
+
// Utility: choose high-contrast (or softened) text style against an arbitrary background color
|
| 204 |
+
const pickTextStyleForBackground = (bgCss, opts = {}) => {
|
| 205 |
+
const cssRoot = document.documentElement;
|
| 206 |
+
const getCssVar = (name) => {
|
| 207 |
+
try { return getComputedStyle(cssRoot).getPropertyValue(name).trim(); } catch { return ''; }
|
| 208 |
+
};
|
| 209 |
+
const resolveCssToRgb01 = (css) => {
|
| 210 |
+
const rgb = parseCssColorToRgb(css);
|
| 211 |
+
if (!rgb) return null;
|
| 212 |
+
return rgb; // already 0..1
|
| 213 |
+
};
|
| 214 |
+
const mixRgb01 = (a, b, t) => ({ r: a.r * (1 - t) + b.r * t, g: a.g * (1 - t) + b.g * t, b: a.b * (1 - t) + b.b * t });
|
| 215 |
+
const relLum = (rgb) => {
|
| 216 |
+
const f = (u) => srgbToLinear(u);
|
| 217 |
+
return 0.2126 * f(rgb.r) + 0.7152 * f(rgb.g) + 0.0722 * f(rgb.b);
|
| 218 |
+
};
|
| 219 |
+
const contrast = (fg, bg) => {
|
| 220 |
+
const L1 = relLum(fg), L2 = relLum(bg); const a = Math.max(L1, L2), b = Math.min(L1, L2);
|
| 221 |
+
return (a + 0.05) / (b + 0.05);
|
| 222 |
+
};
|
| 223 |
+
try {
|
| 224 |
+
const bg = resolveCssToRgb01(bgCss);
|
| 225 |
+
if (!bg) return { fill: getCssVar('--text-color') || '#000', stroke: 'var(--transparent-page-contrast)', strokeWidth: 1 };
|
| 226 |
+
const candidatesCss = [getCssVar('--text-color') || '#111', getCssVar('--on-primary') || '#0f1115', '#000', '#fff'];
|
| 227 |
+
const candidates = candidatesCss
|
| 228 |
+
.map(css => ({ css, rgb: resolveCssToRgb01(css) }))
|
| 229 |
+
.filter(x => !!x.rgb);
|
| 230 |
+
// Pick the max contrast
|
| 231 |
+
let best = candidates[0]; let bestCR = contrast(best.rgb, bg);
|
| 232 |
+
for (let i = 1; i < candidates.length; i++) {
|
| 233 |
+
const cr = contrast(candidates[i].rgb, bg);
|
| 234 |
+
if (cr > bestCR) { best = candidates[i]; bestCR = cr; }
|
| 235 |
+
}
|
| 236 |
+
// Optional softening via blend factor (0..1), blending towards muted color
|
| 237 |
+
const blend = Math.min(1, Math.max(0, Number(opts.blend || 0)));
|
| 238 |
+
let finalRgb = best.rgb;
|
| 239 |
+
if (blend > 0) {
|
| 240 |
+
const mutedCss = getCssVar('--muted-color') || (getCssVar('--text-color') || '#111');
|
| 241 |
+
const mutedRgb = resolveCssToRgb01(mutedCss) || best.rgb;
|
| 242 |
+
finalRgb = mixRgb01(best.rgb, mutedRgb, blend);
|
| 243 |
+
}
|
| 244 |
+
const haloStrength = Math.min(1, Math.max(0, Number(opts.haloStrength == null ? 0.5 : opts.haloStrength)));
|
| 245 |
+
const stroke = (best.css === '#000' || best.css.toLowerCase() === 'black') ? `rgba(255,255,255,${0.30 + 0.40 * haloStrength})` : `rgba(0,0,0,${0.30 + 0.30 * haloStrength})`;
|
| 246 |
+
return { fill: toHex(finalRgb), stroke, strokeWidth: (opts.haloWidth == null ? 1 : Number(opts.haloWidth)) };
|
| 247 |
+
} catch {
|
| 248 |
+
return { fill: getCssVar('--text-color') || '#000', stroke: 'var(--transparent-page-contrast)', strokeWidth: 1 };
|
| 249 |
+
}
|
| 250 |
+
};
|
| 251 |
+
window.ColorPalettes = {
|
| 252 |
+
refresh: updatePalettes,
|
| 253 |
+
notify: () => { try { const primaryOKLCH = getPrimaryOKLCH(); const primaryHex = getPrimaryHex(); document.dispatchEvent(new CustomEvent('palettes:updated', { detail: { primary: primaryHex, primaryOKLCH } })); } catch { } },
|
| 254 |
+
getPrimary: () => getPrimaryHex(),
|
| 255 |
+
getPrimaryOKLCH: () => getPrimaryOKLCH(),
|
| 256 |
+
getColors: (key, count = 6) => {
|
| 257 |
+
const primaryOKLCH = getPrimaryOKLCH();
|
| 258 |
+
if (!primaryOKLCH) return [];
|
| 259 |
+
const total = Math.max(1, Math.min(12, Number(count) || 6));
|
| 260 |
+
if (key === 'categorical') return generators.categorical(primaryOKLCH, total);
|
| 261 |
+
if (key === 'sequential') return generators.sequential(primaryOKLCH, total);
|
| 262 |
+
if (key === 'diverging') return generators.diverging(primaryOKLCH, total);
|
| 263 |
+
return [];
|
| 264 |
+
},
|
| 265 |
+
getTextStyleForBackground: (bgCss, opts) => pickTextStyleForBackground(bgCss, opts || {}),
|
| 266 |
+
chooseReadableText: (bgCss, opts) => pickTextStyleForBackground(bgCss, opts || {})
|
| 267 |
+
};
|
| 268 |
+
};
|
| 269 |
+
|
| 270 |
+
if (document.readyState === 'loading') document.addEventListener('DOMContentLoaded', bootstrap, { once: true });
|
| 271 |
+
else bootstrap();
|
| 272 |
+
})();
|
| 273 |
+
|
| 274 |
+
|
app/scripts/export-latex.mjs
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
import { spawn } from 'node:child_process';
|
| 3 |
+
import { promises as fs } from 'node:fs';
|
| 4 |
+
import { resolve, dirname, basename, extname } from 'node:path';
|
| 5 |
+
import process from 'node:process';
|
| 6 |
+
|
| 7 |
+
async function run(command, args = [], options = {}) {
|
| 8 |
+
return new Promise((resolvePromise, reject) => {
|
| 9 |
+
const child = spawn(command, args, { stdio: 'inherit', shell: false, ...options });
|
| 10 |
+
child.on('error', reject);
|
| 11 |
+
child.on('exit', (code) => {
|
| 12 |
+
if (code === 0) resolvePromise(undefined);
|
| 13 |
+
else reject(new Error(`${command} ${args.join(' ')} exited with code ${code}`));
|
| 14 |
+
});
|
| 15 |
+
});
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
function parseArgs(argv) {
|
| 19 |
+
const out = {};
|
| 20 |
+
for (const arg of argv.slice(2)) {
|
| 21 |
+
if (!arg.startsWith('--')) continue;
|
| 22 |
+
const [k, v] = arg.replace(/^--/, '').split('=');
|
| 23 |
+
out[k] = v === undefined ? true : v;
|
| 24 |
+
}
|
| 25 |
+
return out;
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
function slugify(text) {
|
| 29 |
+
return String(text || '')
|
| 30 |
+
.normalize('NFKD')
|
| 31 |
+
.replace(/\p{Diacritic}+/gu, '')
|
| 32 |
+
.toLowerCase()
|
| 33 |
+
.replace(/[^a-z0-9]+/g, '-')
|
| 34 |
+
.replace(/^-+|-+$/g, '')
|
| 35 |
+
.slice(0, 120) || 'article';
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
async function checkPandocInstalled() {
|
| 39 |
+
try {
|
| 40 |
+
await run('pandoc', ['--version'], { stdio: 'pipe' });
|
| 41 |
+
return true;
|
| 42 |
+
} catch {
|
| 43 |
+
return false;
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
async function readMdxFile(filePath) {
|
| 48 |
+
try {
|
| 49 |
+
const content = await fs.readFile(filePath, 'utf-8');
|
| 50 |
+
return content;
|
| 51 |
+
} catch (error) {
|
| 52 |
+
console.warn(`Warning: Could not read ${filePath}:`, error.message);
|
| 53 |
+
return '';
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
function extractFrontmatter(content) {
|
| 58 |
+
const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---\n/);
|
| 59 |
+
if (!frontmatterMatch) return { frontmatter: {}, content };
|
| 60 |
+
|
| 61 |
+
const frontmatterText = frontmatterMatch[1];
|
| 62 |
+
const contentWithoutFrontmatter = content.replace(frontmatterMatch[0], '');
|
| 63 |
+
|
| 64 |
+
// More robust YAML parsing that handles complex structures
|
| 65 |
+
const frontmatter = {};
|
| 66 |
+
const lines = frontmatterText.split('\n');
|
| 67 |
+
let currentKey = null;
|
| 68 |
+
let currentValue = '';
|
| 69 |
+
let inMultiLineValue = false;
|
| 70 |
+
let multiLineOperator = null; // '>' or '|'
|
| 71 |
+
|
| 72 |
+
for (const line of lines) {
|
| 73 |
+
// Check if this is a new key
|
| 74 |
+
if (line.match(/^[a-zA-Z_][a-zA-Z0-9_]*\s*:/) && !inMultiLineValue) {
|
| 75 |
+
// Save previous key if exists
|
| 76 |
+
if (currentKey) {
|
| 77 |
+
frontmatter[currentKey] = currentValue.trim();
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
const [key, ...valueParts] = line.split(':');
|
| 81 |
+
currentKey = key.trim();
|
| 82 |
+
currentValue = valueParts.join(':').trim();
|
| 83 |
+
|
| 84 |
+
// Check for multi-line operators
|
| 85 |
+
if (currentValue.endsWith('>') || currentValue.endsWith('|')) {
|
| 86 |
+
multiLineOperator = currentValue.slice(-1);
|
| 87 |
+
currentValue = currentValue.slice(0, -1).trim();
|
| 88 |
+
inMultiLineValue = true;
|
| 89 |
+
} else if (currentValue) {
|
| 90 |
+
inMultiLineValue = false;
|
| 91 |
+
} else {
|
| 92 |
+
inMultiLineValue = true;
|
| 93 |
+
}
|
| 94 |
+
} else if (currentKey && (inMultiLineValue || line.match(/^\s/))) {
|
| 95 |
+
// Continuation line or nested content
|
| 96 |
+
if (inMultiLineValue) {
|
| 97 |
+
if (line.trim() === '' && multiLineOperator === '>') {
|
| 98 |
+
// Empty line in folded style should become space
|
| 99 |
+
currentValue += ' ';
|
| 100 |
+
} else {
|
| 101 |
+
const lineContent = line.startsWith(' ') ? line : ' ' + line;
|
| 102 |
+
currentValue += lineContent;
|
| 103 |
+
}
|
| 104 |
+
} else {
|
| 105 |
+
currentValue += '\n' + line;
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
// Save the last key
|
| 111 |
+
if (currentKey) {
|
| 112 |
+
frontmatter[currentKey] = currentValue.trim();
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
return { frontmatter, content: contentWithoutFrontmatter };
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
function cleanMdxToMarkdown(content) {
|
| 119 |
+
// Remove import statements
|
| 120 |
+
content = content.replace(/^import .+?;?\s*$/gm, '');
|
| 121 |
+
|
| 122 |
+
// Remove JSX component calls like <ComponentName />
|
| 123 |
+
content = content.replace(/<[A-Z][a-zA-Z0-9]*\s*\/>/g, '');
|
| 124 |
+
|
| 125 |
+
// Convert JSX components to simpler markdown
|
| 126 |
+
// Handle Sidenote components specially
|
| 127 |
+
content = content.replace(/<Sidenote>([\s\S]*?)<\/Sidenote>/g, (match, innerContent) => {
|
| 128 |
+
// Extract main content and aside content
|
| 129 |
+
const asideMatch = innerContent.match(/<Fragment slot="aside">([\s\S]*?)<\/Fragment>/);
|
| 130 |
+
const mainContent = innerContent.replace(/<Fragment slot="aside">[\s\S]*?<\/Fragment>/, '').trim();
|
| 131 |
+
const asideContent = asideMatch ? asideMatch[1].trim() : '';
|
| 132 |
+
|
| 133 |
+
let result = mainContent;
|
| 134 |
+
if (asideContent) {
|
| 135 |
+
result += `\n\n> **Note:** ${asideContent}`;
|
| 136 |
+
}
|
| 137 |
+
return result;
|
| 138 |
+
});
|
| 139 |
+
|
| 140 |
+
// Handle Note components
|
| 141 |
+
content = content.replace(/<Note[^>]*>([\s\S]*?)<\/Note>/g, (match, innerContent) => {
|
| 142 |
+
return `\n> **Note:** ${innerContent.trim()}\n`;
|
| 143 |
+
});
|
| 144 |
+
|
| 145 |
+
// Handle Wide and FullWidth components
|
| 146 |
+
content = content.replace(/<(Wide|FullWidth)>([\s\S]*?)<\/\1>/g, '$2');
|
| 147 |
+
|
| 148 |
+
// Handle HtmlEmbed components (convert to simple text)
|
| 149 |
+
content = content.replace(/<HtmlEmbed[^>]*\/>/g, '*[Interactive content not available in LaTeX]*');
|
| 150 |
+
|
| 151 |
+
// Remove remaining JSX fragments
|
| 152 |
+
content = content.replace(/<Fragment[^>]*>([\s\S]*?)<\/Fragment>/g, '$1');
|
| 153 |
+
content = content.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>([\s\S]*?)<\/[A-Z][a-zA-Z0-9]*>/g, '$1');
|
| 154 |
+
|
| 155 |
+
// Clean up className attributes
|
| 156 |
+
content = content.replace(/className="[^"]*"/g, '');
|
| 157 |
+
|
| 158 |
+
// Clean up extra whitespace
|
| 159 |
+
content = content.replace(/\n{3,}/g, '\n\n');
|
| 160 |
+
|
| 161 |
+
// Clean up characters that might cause YAML parsing issues
|
| 162 |
+
// Remove any potential YAML-style markers that might interfere
|
| 163 |
+
content = content.replace(/^---$/gm, '');
|
| 164 |
+
content = content.replace(/^\s*&\s+/gm, ''); // Remove YAML aliases
|
| 165 |
+
|
| 166 |
+
return content.trim();
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
async function processChapterImports(content, contentDir) {
|
| 170 |
+
let processedContent = content;
|
| 171 |
+
|
| 172 |
+
// First, extract all import statements and their corresponding component calls
|
| 173 |
+
const importPattern = /import\s+(\w+)\s+from\s+["']\.\/chapters\/([^"']+)["'];?/g;
|
| 174 |
+
const imports = new Map();
|
| 175 |
+
let match;
|
| 176 |
+
|
| 177 |
+
// Collect all imports
|
| 178 |
+
while ((match = importPattern.exec(content)) !== null) {
|
| 179 |
+
const [fullImport, componentName, chapterPath] = match;
|
| 180 |
+
imports.set(componentName, { path: chapterPath, importStatement: fullImport });
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Remove all import statements
|
| 184 |
+
processedContent = processedContent.replace(importPattern, '');
|
| 185 |
+
|
| 186 |
+
// Process each component call
|
| 187 |
+
for (const [componentName, { path: chapterPath }] of imports) {
|
| 188 |
+
const componentCallPattern = new RegExp(`<${componentName}\\s*\\/>`, 'g');
|
| 189 |
+
|
| 190 |
+
try {
|
| 191 |
+
const chapterFile = resolve(contentDir, 'chapters', chapterPath);
|
| 192 |
+
const chapterContent = await readMdxFile(chapterFile);
|
| 193 |
+
const { content: chapterMarkdown } = extractFrontmatter(chapterContent);
|
| 194 |
+
const cleanChapter = cleanMdxToMarkdown(chapterMarkdown);
|
| 195 |
+
|
| 196 |
+
processedContent = processedContent.replace(componentCallPattern, cleanChapter);
|
| 197 |
+
console.log(`✅ Processed chapter: ${chapterPath}`);
|
| 198 |
+
} catch (error) {
|
| 199 |
+
console.warn(`Warning: Could not process chapter ${chapterPath}:`, error.message);
|
| 200 |
+
processedContent = processedContent.replace(componentCallPattern, `\n*[Chapter ${chapterPath} could not be loaded]*\n`);
|
| 201 |
+
}
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
return processedContent;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
function createLatexPreamble(frontmatter) {
|
| 208 |
+
const title = frontmatter.title ? frontmatter.title.replace(/\n/g, ' ') : 'Untitled Article';
|
| 209 |
+
const subtitle = frontmatter.subtitle || '';
|
| 210 |
+
const authors = frontmatter.authors || '';
|
| 211 |
+
const date = frontmatter.published || '';
|
| 212 |
+
|
| 213 |
+
return `\\documentclass[11pt,a4paper]{article}
|
| 214 |
+
\\usepackage[utf8]{inputenc}
|
| 215 |
+
\\usepackage[T1]{fontenc}
|
| 216 |
+
\\usepackage{amsmath,amsfonts,amssymb}
|
| 217 |
+
\\usepackage{graphicx}
|
| 218 |
+
\\usepackage{hyperref}
|
| 219 |
+
\\usepackage{booktabs}
|
| 220 |
+
\\usepackage{longtable}
|
| 221 |
+
\\usepackage{array}
|
| 222 |
+
\\usepackage{multirow}
|
| 223 |
+
\\usepackage{wrapfig}
|
| 224 |
+
\\usepackage{float}
|
| 225 |
+
\\usepackage{colortbl}
|
| 226 |
+
\\usepackage{pdflscape}
|
| 227 |
+
\\usepackage{tabu}
|
| 228 |
+
\\usepackage{threeparttable}
|
| 229 |
+
\\usepackage{threeparttablex}
|
| 230 |
+
\\usepackage{ulem}
|
| 231 |
+
\\usepackage{makecell}
|
| 232 |
+
\\usepackage{xcolor}
|
| 233 |
+
\\usepackage{listings}
|
| 234 |
+
\\usepackage{fancyvrb}
|
| 235 |
+
\\usepackage{geometry}
|
| 236 |
+
\\geometry{margin=1in}
|
| 237 |
+
|
| 238 |
+
\\title{${title}${subtitle ? `\\\\\\large ${subtitle}` : ''}}
|
| 239 |
+
${authors ? `\\author{${authors}}` : ''}
|
| 240 |
+
${date ? `\\date{${date}}` : ''}
|
| 241 |
+
|
| 242 |
+
\\begin{document}
|
| 243 |
+
\\maketitle
|
| 244 |
+
\\tableofcontents
|
| 245 |
+
\\newpage
|
| 246 |
+
|
| 247 |
+
`;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
async function main() {
|
| 251 |
+
const cwd = process.cwd();
|
| 252 |
+
const args = parseArgs(process.argv);
|
| 253 |
+
|
| 254 |
+
// Check if pandoc is installed
|
| 255 |
+
const hasPandoc = await checkPandocInstalled();
|
| 256 |
+
if (!hasPandoc) {
|
| 257 |
+
console.error('❌ Pandoc is not installed. Please install it first:');
|
| 258 |
+
console.error(' macOS: brew install pandoc');
|
| 259 |
+
console.error(' Ubuntu: apt-get install pandoc');
|
| 260 |
+
console.error(' Windows: choco install pandoc');
|
| 261 |
+
process.exit(1);
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
const contentDir = resolve(cwd, 'src/content');
|
| 265 |
+
const articleFile = resolve(contentDir, 'article.mdx');
|
| 266 |
+
|
| 267 |
+
// Check if article.mdx exists
|
| 268 |
+
try {
|
| 269 |
+
await fs.access(articleFile);
|
| 270 |
+
} catch {
|
| 271 |
+
console.error(`❌ Could not find article.mdx at ${articleFile}`);
|
| 272 |
+
process.exit(1);
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
console.log('> Reading article content...');
|
| 276 |
+
const articleContent = await readMdxFile(articleFile);
|
| 277 |
+
const { frontmatter, content } = extractFrontmatter(articleContent);
|
| 278 |
+
|
| 279 |
+
console.log('> Processing chapters...');
|
| 280 |
+
const processedContent = await processChapterImports(content, contentDir);
|
| 281 |
+
|
| 282 |
+
console.log('> Converting MDX to Markdown...');
|
| 283 |
+
const markdownContent = cleanMdxToMarkdown(processedContent);
|
| 284 |
+
|
| 285 |
+
// Generate output filename
|
| 286 |
+
const title = frontmatter.title ? frontmatter.title.replace(/\n/g, ' ') : 'article';
|
| 287 |
+
const outFileBase = args.filename ? String(args.filename).replace(/\.(tex|pdf)$/i, '') : slugify(title);
|
| 288 |
+
|
| 289 |
+
// Create temporary markdown file (ensure it's pure markdown without YAML frontmatter)
|
| 290 |
+
const tempMdFile = resolve(cwd, 'temp-article.md');
|
| 291 |
+
|
| 292 |
+
// Clean the markdown content to ensure no YAML frontmatter remains
|
| 293 |
+
let cleanMarkdown = markdownContent;
|
| 294 |
+
// Remove any potential YAML frontmatter that might have leaked through
|
| 295 |
+
cleanMarkdown = cleanMarkdown.replace(/^---\n[\s\S]*?\n---\n/, '');
|
| 296 |
+
// Remove any standalone YAML blocks that might cause issues
|
| 297 |
+
cleanMarkdown = cleanMarkdown.replace(/^---\n([\s\S]*?)\n---$/gm, '');
|
| 298 |
+
|
| 299 |
+
await fs.writeFile(tempMdFile, cleanMarkdown);
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
console.log('> Converting to LaTeX with Pandoc...');
|
| 303 |
+
const outputLatex = resolve(cwd, 'dist', `${outFileBase}.tex`);
|
| 304 |
+
|
| 305 |
+
// Ensure dist directory exists
|
| 306 |
+
await fs.mkdir(resolve(cwd, 'dist'), { recursive: true });
|
| 307 |
+
|
| 308 |
+
// Pandoc conversion arguments
|
| 309 |
+
const pandocArgs = [
|
| 310 |
+
tempMdFile,
|
| 311 |
+
'-o', outputLatex,
|
| 312 |
+
'--from=markdown-yaml_metadata_block', // Explicitly exclude YAML metadata parsing
|
| 313 |
+
'--to=latex',
|
| 314 |
+
'--standalone',
|
| 315 |
+
'--toc',
|
| 316 |
+
'--number-sections',
|
| 317 |
+
'--highlight-style=tango',
|
| 318 |
+
'--listings'
|
| 319 |
+
];
|
| 320 |
+
|
| 321 |
+
// Add bibliography if it exists
|
| 322 |
+
const bibFile = resolve(contentDir, 'bibliography.bib');
|
| 323 |
+
try {
|
| 324 |
+
await fs.access(bibFile);
|
| 325 |
+
pandocArgs.push('--bibliography', bibFile);
|
| 326 |
+
pandocArgs.push('--citeproc');
|
| 327 |
+
console.log('✅ Found bibliography file, including citations');
|
| 328 |
+
} catch {
|
| 329 |
+
console.log('ℹ️ No bibliography file found');
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
try {
|
| 333 |
+
await run('pandoc', pandocArgs);
|
| 334 |
+
console.log(`✅ LaTeX generated: ${outputLatex}`);
|
| 335 |
+
|
| 336 |
+
// Optionally compile to PDF if requested
|
| 337 |
+
if (args.pdf) {
|
| 338 |
+
console.log('> Compiling LaTeX to PDF...');
|
| 339 |
+
const outputPdf = resolve(cwd, 'dist', `${outFileBase}.pdf`);
|
| 340 |
+
await run('pdflatex', ['-output-directory', resolve(cwd, 'dist'), outputLatex]);
|
| 341 |
+
console.log(`✅ PDF generated: ${outputPdf}`);
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
} catch (error) {
|
| 345 |
+
console.error('❌ Pandoc conversion failed:', error.message);
|
| 346 |
+
process.exit(1);
|
| 347 |
+
} finally {
|
| 348 |
+
// Clean up temporary file
|
| 349 |
+
try {
|
| 350 |
+
await fs.unlink(tempMdFile);
|
| 351 |
+
} catch { }
|
| 352 |
+
}
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
main().catch((err) => {
|
| 356 |
+
console.error(err);
|
| 357 |
+
process.exit(1);
|
| 358 |
+
});
|
app/scripts/export-pdf.mjs
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
import { spawn } from 'node:child_process';
|
| 3 |
+
import { setTimeout as delay } from 'node:timers/promises';
|
| 4 |
+
import { chromium } from 'playwright';
|
| 5 |
+
import { resolve } from 'node:path';
|
| 6 |
+
import { promises as fs } from 'node:fs';
|
| 7 |
+
import process from 'node:process';
|
| 8 |
+
|
| 9 |
+
async function run(command, args = [], options = {}) {
|
| 10 |
+
return new Promise((resolvePromise, reject) => {
|
| 11 |
+
const child = spawn(command, args, { stdio: 'inherit', shell: false, ...options });
|
| 12 |
+
child.on('error', reject);
|
| 13 |
+
child.on('exit', (code) => {
|
| 14 |
+
if (code === 0) resolvePromise(undefined);
|
| 15 |
+
else reject(new Error(`${command} ${args.join(' ')} exited with code ${code}`));
|
| 16 |
+
});
|
| 17 |
+
});
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
async function waitForServer(url, timeoutMs = 60000) {
|
| 21 |
+
const start = Date.now();
|
| 22 |
+
while (Date.now() - start < timeoutMs) {
|
| 23 |
+
try {
|
| 24 |
+
const res = await fetch(url);
|
| 25 |
+
if (res.ok) return;
|
| 26 |
+
} catch {}
|
| 27 |
+
await delay(500);
|
| 28 |
+
}
|
| 29 |
+
throw new Error(`Server did not start in time: ${url}`);
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
function parseArgs(argv) {
|
| 33 |
+
const out = {};
|
| 34 |
+
for (const arg of argv.slice(2)) {
|
| 35 |
+
if (!arg.startsWith('--')) continue;
|
| 36 |
+
const [k, v] = arg.replace(/^--/, '').split('=');
|
| 37 |
+
out[k] = v === undefined ? true : v;
|
| 38 |
+
}
|
| 39 |
+
return out;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
function slugify(text) {
|
| 43 |
+
return String(text || '')
|
| 44 |
+
.normalize('NFKD')
|
| 45 |
+
.replace(/\p{Diacritic}+/gu, '')
|
| 46 |
+
.toLowerCase()
|
| 47 |
+
.replace(/[^a-z0-9]+/g, '-')
|
| 48 |
+
.replace(/^-+|-+$/g, '')
|
| 49 |
+
.slice(0, 120) || 'article';
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
function parseMargin(margin) {
|
| 53 |
+
if (!margin) return { top: '12mm', right: '12mm', bottom: '16mm', left: '12mm' };
|
| 54 |
+
const parts = String(margin).split(',').map(s => s.trim()).filter(Boolean);
|
| 55 |
+
if (parts.length === 1) {
|
| 56 |
+
return { top: parts[0], right: parts[0], bottom: parts[0], left: parts[0] };
|
| 57 |
+
}
|
| 58 |
+
if (parts.length === 2) {
|
| 59 |
+
return { top: parts[0], right: parts[1], bottom: parts[0], left: parts[1] };
|
| 60 |
+
}
|
| 61 |
+
if (parts.length === 3) {
|
| 62 |
+
return { top: parts[0], right: parts[1], bottom: parts[2], left: parts[1] };
|
| 63 |
+
}
|
| 64 |
+
return { top: parts[0] || '12mm', right: parts[1] || '12mm', bottom: parts[2] || '16mm', left: parts[3] || '12mm' };
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
function cssLengthToMm(val) {
|
| 68 |
+
if (!val) return 0;
|
| 69 |
+
const s = String(val).trim();
|
| 70 |
+
if (/mm$/i.test(s)) return parseFloat(s);
|
| 71 |
+
if (/cm$/i.test(s)) return parseFloat(s) * 10;
|
| 72 |
+
if (/in$/i.test(s)) return parseFloat(s) * 25.4;
|
| 73 |
+
if (/px$/i.test(s)) return (parseFloat(s) / 96) * 25.4; // 96 CSS px per inch
|
| 74 |
+
const num = parseFloat(s);
|
| 75 |
+
return Number.isFinite(num) ? num : 0; // assume mm if unitless
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
function getFormatSizeMm(format) {
|
| 79 |
+
const f = String(format || 'A4').toLowerCase();
|
| 80 |
+
switch (f) {
|
| 81 |
+
case 'letter': return { w: 215.9, h: 279.4 };
|
| 82 |
+
case 'legal': return { w: 215.9, h: 355.6 };
|
| 83 |
+
case 'a3': return { w: 297, h: 420 };
|
| 84 |
+
case 'tabloid': return { w: 279.4, h: 431.8 };
|
| 85 |
+
case 'a4':
|
| 86 |
+
default: return { w: 210, h: 297 };
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
async function waitForImages(page, timeoutMs = 15000) {
|
| 91 |
+
await page.evaluate(async (timeout) => {
|
| 92 |
+
const deadline = Date.now() + timeout;
|
| 93 |
+
const imgs = Array.from(document.images || []);
|
| 94 |
+
const unloaded = imgs.filter(img => !img.complete || (img.naturalWidth === 0));
|
| 95 |
+
await Promise.race([
|
| 96 |
+
Promise.all(unloaded.map(img => new Promise(res => {
|
| 97 |
+
if (img.complete && img.naturalWidth !== 0) return res(undefined);
|
| 98 |
+
img.addEventListener('load', () => res(undefined), { once: true });
|
| 99 |
+
img.addEventListener('error', () => res(undefined), { once: true });
|
| 100 |
+
}))),
|
| 101 |
+
new Promise(res => setTimeout(res, Math.max(0, deadline - Date.now())))
|
| 102 |
+
]);
|
| 103 |
+
}, timeoutMs);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
async function waitForPlotly(page, timeoutMs = 20000) {
|
| 107 |
+
await page.evaluate(async (timeout) => {
|
| 108 |
+
const start = Date.now();
|
| 109 |
+
const hasPlots = () => Array.from(document.querySelectorAll('.js-plotly-plot')).length > 0;
|
| 110 |
+
// Wait until plots exist or timeout
|
| 111 |
+
while (!hasPlots() && (Date.now() - start) < timeout) {
|
| 112 |
+
await new Promise(r => setTimeout(r, 200));
|
| 113 |
+
}
|
| 114 |
+
const deadline = start + timeout;
|
| 115 |
+
// Then wait until each plot contains the main svg
|
| 116 |
+
const allReady = () => Array.from(document.querySelectorAll('.js-plotly-plot')).every(el => el.querySelector('svg.main-svg'));
|
| 117 |
+
while (!allReady() && Date.now() < deadline) {
|
| 118 |
+
await new Promise(r => setTimeout(r, 200));
|
| 119 |
+
}
|
| 120 |
+
}, timeoutMs);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
async function waitForD3(page, timeoutMs = 20000) {
|
| 124 |
+
await page.evaluate(async (timeout) => {
|
| 125 |
+
const start = Date.now();
|
| 126 |
+
const isReady = () => {
|
| 127 |
+
// Prioritize hero banner if present (generic container)
|
| 128 |
+
const hero = document.querySelector('.hero-banner');
|
| 129 |
+
if (hero) {
|
| 130 |
+
return !!hero.querySelector('svg circle, svg path, svg rect, svg g');
|
| 131 |
+
}
|
| 132 |
+
// Else require all D3 containers on page to have shapes
|
| 133 |
+
const containers = [
|
| 134 |
+
...Array.from(document.querySelectorAll('.d3-line')),
|
| 135 |
+
...Array.from(document.querySelectorAll('.d3-bar'))
|
| 136 |
+
];
|
| 137 |
+
if (!containers.length) return true;
|
| 138 |
+
return containers.every(c => c.querySelector('svg circle, svg path, svg rect, svg g'));
|
| 139 |
+
};
|
| 140 |
+
while (!isReady() && (Date.now() - start) < timeout) {
|
| 141 |
+
await new Promise(r => setTimeout(r, 200));
|
| 142 |
+
}
|
| 143 |
+
}, timeoutMs);
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
async function waitForStableLayout(page, timeoutMs = 5000) {
|
| 147 |
+
const start = Date.now();
|
| 148 |
+
let last = await page.evaluate(() => document.scrollingElement ? document.scrollingElement.scrollHeight : document.body.scrollHeight);
|
| 149 |
+
let stableCount = 0;
|
| 150 |
+
while ((Date.now() - start) < timeoutMs && stableCount < 3) {
|
| 151 |
+
await page.waitForTimeout(250);
|
| 152 |
+
const now = await page.evaluate(() => document.scrollingElement ? document.scrollingElement.scrollHeight : document.body.scrollHeight);
|
| 153 |
+
if (now === last) stableCount += 1; else { stableCount = 0; last = now; }
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
async function main() {
|
| 158 |
+
const cwd = process.cwd();
|
| 159 |
+
const port = Number(process.env.PREVIEW_PORT || 8080);
|
| 160 |
+
const baseUrl = `http://127.0.0.1:${port}/`;
|
| 161 |
+
const args = parseArgs(process.argv);
|
| 162 |
+
// Default: light (do not rely on env vars implicitly)
|
| 163 |
+
const theme = (args.theme === 'dark' || args.theme === 'light') ? args.theme : 'light';
|
| 164 |
+
const format = args.format || 'A4';
|
| 165 |
+
const margin = parseMargin(args.margin);
|
| 166 |
+
const wait = (args.wait || 'full'); // 'networkidle' | 'images' | 'plotly' | 'full'
|
| 167 |
+
|
| 168 |
+
// filename can be provided, else computed from DOM (button) or page title later
|
| 169 |
+
let outFileBase = (args.filename && String(args.filename).replace(/\.pdf$/i, '')) || 'article';
|
| 170 |
+
|
| 171 |
+
// Build only if dist/ does not exist
|
| 172 |
+
const distDir = resolve(cwd, 'dist');
|
| 173 |
+
let hasDist = false;
|
| 174 |
+
try {
|
| 175 |
+
const st = await fs.stat(distDir);
|
| 176 |
+
hasDist = st && st.isDirectory();
|
| 177 |
+
} catch {}
|
| 178 |
+
if (!hasDist) {
|
| 179 |
+
console.log('> Building Astro site…');
|
| 180 |
+
await run('npm', ['run', 'build']);
|
| 181 |
+
} else {
|
| 182 |
+
console.log('> Skipping build (dist/ exists)…');
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
console.log('> Starting Astro preview…');
|
| 186 |
+
// Start preview in its own process group so we can terminate all children reliably
|
| 187 |
+
const preview = spawn('npm', ['run', 'preview'], { cwd, stdio: 'inherit', detached: true });
|
| 188 |
+
const previewExit = new Promise((resolvePreview) => {
|
| 189 |
+
preview.on('close', (code, signal) => resolvePreview({ code, signal }));
|
| 190 |
+
});
|
| 191 |
+
|
| 192 |
+
try {
|
| 193 |
+
await waitForServer(baseUrl, 60000);
|
| 194 |
+
console.log('> Server ready, generating PDF…');
|
| 195 |
+
|
| 196 |
+
const browser = await chromium.launch({ headless: true });
|
| 197 |
+
try {
|
| 198 |
+
const context = await browser.newContext();
|
| 199 |
+
await context.addInitScript((desired) => {
|
| 200 |
+
try {
|
| 201 |
+
localStorage.setItem('theme', desired);
|
| 202 |
+
// Apply theme immediately to avoid flashes
|
| 203 |
+
if (document && document.documentElement) {
|
| 204 |
+
document.documentElement.dataset.theme = desired;
|
| 205 |
+
}
|
| 206 |
+
} catch {}
|
| 207 |
+
}, theme);
|
| 208 |
+
const page = await context.newPage();
|
| 209 |
+
// Pre-fit viewport width to printable width so charts size correctly
|
| 210 |
+
const fmt = getFormatSizeMm(format);
|
| 211 |
+
const mw = fmt.w - cssLengthToMm(margin.left) - cssLengthToMm(margin.right);
|
| 212 |
+
const printableWidthPx = Math.max(320, Math.round((mw / 25.4) * 96));
|
| 213 |
+
await page.setViewportSize({ width: printableWidthPx, height: 1200 });
|
| 214 |
+
await page.goto(baseUrl, { waitUntil: 'load', timeout: 60000 });
|
| 215 |
+
// Give time for CDN scripts (Plotly/D3) to attach and for our fragment hooks to run
|
| 216 |
+
try { await page.waitForFunction(() => !!window.Plotly, { timeout: 8000 }); } catch {}
|
| 217 |
+
try { await page.waitForFunction(() => !!window.d3, { timeout: 8000 }); } catch {}
|
| 218 |
+
// Prefer explicit filename from the download button if present
|
| 219 |
+
if (!args.filename) {
|
| 220 |
+
const fromBtn = await page.evaluate(() => {
|
| 221 |
+
const btn = document.getElementById('download-pdf-btn');
|
| 222 |
+
const f = btn ? btn.getAttribute('data-pdf-filename') : null;
|
| 223 |
+
return f || '';
|
| 224 |
+
});
|
| 225 |
+
if (fromBtn) {
|
| 226 |
+
outFileBase = String(fromBtn).replace(/\.pdf$/i, '');
|
| 227 |
+
} else {
|
| 228 |
+
// Fallback: compute slug from hero title or document.title
|
| 229 |
+
const title = await page.evaluate(() => {
|
| 230 |
+
const h1 = document.querySelector('h1.hero-title');
|
| 231 |
+
const t = h1 ? h1.textContent : document.title;
|
| 232 |
+
return (t || '').replace(/\s+/g, ' ').trim();
|
| 233 |
+
});
|
| 234 |
+
outFileBase = slugify(title);
|
| 235 |
+
}
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
// Wait for render readiness
|
| 239 |
+
if (wait === 'images' || wait === 'full') {
|
| 240 |
+
await waitForImages(page);
|
| 241 |
+
}
|
| 242 |
+
if (wait === 'd3' || wait === 'full') {
|
| 243 |
+
await waitForD3(page);
|
| 244 |
+
}
|
| 245 |
+
if (wait === 'plotly' || wait === 'full') {
|
| 246 |
+
await waitForPlotly(page);
|
| 247 |
+
}
|
| 248 |
+
if (wait === 'full') {
|
| 249 |
+
await waitForStableLayout(page);
|
| 250 |
+
}
|
| 251 |
+
await page.emulateMedia({ media: 'print' });
|
| 252 |
+
|
| 253 |
+
// Enforce responsive sizing for SVG/iframes by removing hard attrs and injecting CSS (top-level and inside same-origin iframes)
|
| 254 |
+
try {
|
| 255 |
+
await page.evaluate(() => {
|
| 256 |
+
function isSmallSvg(svg){
|
| 257 |
+
try {
|
| 258 |
+
const vb = svg && svg.viewBox && svg.viewBox.baseVal ? svg.viewBox.baseVal : null;
|
| 259 |
+
if (vb && vb.width && vb.height && vb.width <= 50 && vb.height <= 50) return true;
|
| 260 |
+
const r = svg.getBoundingClientRect && svg.getBoundingClientRect();
|
| 261 |
+
if (r && r.width && r.height && r.width <= 50 && r.height <= 50) return true;
|
| 262 |
+
} catch {}
|
| 263 |
+
return false;
|
| 264 |
+
}
|
| 265 |
+
function lockSmallSvgSize(svg){
|
| 266 |
+
try {
|
| 267 |
+
const r = svg.getBoundingClientRect ? svg.getBoundingClientRect() : null;
|
| 268 |
+
const w = (r && r.width) ? Math.round(r.width) : null;
|
| 269 |
+
const h = (r && r.height) ? Math.round(r.height) : null;
|
| 270 |
+
if (w) svg.style.setProperty('width', w + 'px', 'important');
|
| 271 |
+
if (h) svg.style.setProperty('height', h + 'px', 'important');
|
| 272 |
+
svg.style.setProperty('max-width', 'none', 'important');
|
| 273 |
+
} catch {}
|
| 274 |
+
}
|
| 275 |
+
function fixSvg(svg){
|
| 276 |
+
if (!svg) return;
|
| 277 |
+
// Do not alter hero banner SVG sizing; it may rely on explicit width/height
|
| 278 |
+
try { if (svg.closest && svg.closest('.hero-banner')) return; } catch {}
|
| 279 |
+
if (isSmallSvg(svg)) { lockSmallSvgSize(svg); return; }
|
| 280 |
+
try { svg.removeAttribute('width'); } catch {}
|
| 281 |
+
try { svg.removeAttribute('height'); } catch {}
|
| 282 |
+
svg.style.maxWidth = '100%';
|
| 283 |
+
svg.style.width = '100%';
|
| 284 |
+
svg.style.height = 'auto';
|
| 285 |
+
if (!svg.getAttribute('preserveAspectRatio')) svg.setAttribute('preserveAspectRatio','xMidYMid meet');
|
| 286 |
+
}
|
| 287 |
+
document.querySelectorAll('svg').forEach(fixSvg);
|
| 288 |
+
document.querySelectorAll('.mermaid, .mermaid svg').forEach((el)=>{
|
| 289 |
+
if (el.tagName && el.tagName.toLowerCase() === 'svg') fixSvg(el);
|
| 290 |
+
else { el.style.display='block'; el.style.width='100%'; el.style.maxWidth='100%'; }
|
| 291 |
+
});
|
| 292 |
+
document.querySelectorAll('iframe, embed, object').forEach((el) => {
|
| 293 |
+
el.style.width = '100%';
|
| 294 |
+
el.style.maxWidth = '100%';
|
| 295 |
+
try { el.removeAttribute('width'); } catch {}
|
| 296 |
+
// Best-effort inject into same-origin frames
|
| 297 |
+
try {
|
| 298 |
+
const doc = (el.tagName.toLowerCase()==='object' ? el.contentDocument : el.contentDocument);
|
| 299 |
+
if (doc && doc.head) {
|
| 300 |
+
const s = doc.createElement('style');
|
| 301 |
+
s.textContent = 'html,body{overflow-x:hidden;} svg,canvas,img,video{max-width:100%!important;height:auto!important;} svg[width]{width:100%!important}';
|
| 302 |
+
doc.head.appendChild(s);
|
| 303 |
+
doc.querySelectorAll('svg').forEach((svg)=>{ if (isSmallSvg(svg)) lockSmallSvgSize(svg); else fixSvg(svg); });
|
| 304 |
+
}
|
| 305 |
+
} catch (_) { /* cross-origin; ignore */ }
|
| 306 |
+
});
|
| 307 |
+
});
|
| 308 |
+
} catch {}
|
| 309 |
+
|
| 310 |
+
// Generate OG thumbnail (1200x630)
|
| 311 |
+
try {
|
| 312 |
+
const ogW = 1200, ogH = 630;
|
| 313 |
+
await page.setViewportSize({ width: ogW, height: ogH });
|
| 314 |
+
// Give layout a tick to adjust
|
| 315 |
+
await page.waitForTimeout(200);
|
| 316 |
+
// Ensure layout & D3 re-rendered after viewport change
|
| 317 |
+
await page.evaluate(() => { window.scrollTo(0, 0); window.dispatchEvent(new Event('resize')); });
|
| 318 |
+
try { await waitForD3(page, 8000); } catch {}
|
| 319 |
+
|
| 320 |
+
// Temporarily improve visibility for light theme thumbnails
|
| 321 |
+
// - Force normal blend for points
|
| 322 |
+
// - Ensure an SVG background (CSS background on svg element)
|
| 323 |
+
const cssHandle = await page.addStyleTag({ content: `
|
| 324 |
+
.hero .points { mix-blend-mode: normal !important; }
|
| 325 |
+
` });
|
| 326 |
+
const thumbPath = resolve(cwd, 'dist', 'thumb.auto.jpg');
|
| 327 |
+
await page.screenshot({ path: thumbPath, type: 'jpeg', quality: 85, fullPage: false });
|
| 328 |
+
// Also emit PNG for compatibility if needed
|
| 329 |
+
const thumbPngPath = resolve(cwd, 'dist', 'thumb.auto.png');
|
| 330 |
+
await page.screenshot({ path: thumbPngPath, type: 'png', fullPage: false });
|
| 331 |
+
const publicThumb = resolve(cwd, 'public', 'thumb.auto.jpg');
|
| 332 |
+
const publicThumbPng = resolve(cwd, 'public', 'thumb.auto.png');
|
| 333 |
+
try { await fs.copyFile(thumbPath, publicThumb); } catch {}
|
| 334 |
+
try { await fs.copyFile(thumbPngPath, publicThumbPng); } catch {}
|
| 335 |
+
// Remove temporary style so PDF is unaffected
|
| 336 |
+
try { await cssHandle.evaluate((el) => el.remove()); } catch {}
|
| 337 |
+
console.log(`✅ OG thumbnail generated: ${thumbPath}`);
|
| 338 |
+
} catch (e) {
|
| 339 |
+
console.warn('Unable to generate OG thumbnail:', e?.message || e);
|
| 340 |
+
}
|
| 341 |
+
const outPath = resolve(cwd, 'dist', `${outFileBase}.pdf`);
|
| 342 |
+
// Restore viewport to printable width before PDF (thumbnail changed it)
|
| 343 |
+
try {
|
| 344 |
+
const fmt2 = getFormatSizeMm(format);
|
| 345 |
+
const mw2 = fmt2.w - cssLengthToMm(margin.left) - cssLengthToMm(margin.right);
|
| 346 |
+
const printableWidthPx2 = Math.max(320, Math.round((mw2 / 25.4) * 96));
|
| 347 |
+
await page.setViewportSize({ width: printableWidthPx2, height: 1400 });
|
| 348 |
+
await page.evaluate(() => { window.scrollTo(0, 0); window.dispatchEvent(new Event('resize')); });
|
| 349 |
+
try { await waitForD3(page, 8000); } catch {}
|
| 350 |
+
await waitForStableLayout(page);
|
| 351 |
+
// Re-apply responsive fixes after viewport change
|
| 352 |
+
try {
|
| 353 |
+
await page.evaluate(() => {
|
| 354 |
+
function isSmallSvg(svg){
|
| 355 |
+
try {
|
| 356 |
+
const vb = svg && svg.viewBox && svg.viewBox.baseVal ? svg.viewBox.baseVal : null;
|
| 357 |
+
if (vb && vb.width && vb.height && vb.width <= 50 && vb.height <= 50) return true;
|
| 358 |
+
const r = svg.getBoundingClientRect && svg.getBoundingClientRect();
|
| 359 |
+
if (r && r.width && r.height && r.width <= 50 && r.height <= 50) return true;
|
| 360 |
+
} catch {}
|
| 361 |
+
return false;
|
| 362 |
+
}
|
| 363 |
+
function lockSmallSvgSize(svg){
|
| 364 |
+
try {
|
| 365 |
+
const r = svg.getBoundingClientRect ? svg.getBoundingClientRect() : null;
|
| 366 |
+
const w = (r && r.width) ? Math.round(r.width) : null;
|
| 367 |
+
const h = (r && r.height) ? Math.round(r.height) : null;
|
| 368 |
+
if (w) svg.style.setProperty('width', w + 'px', 'important');
|
| 369 |
+
if (h) svg.style.setProperty('height', h + 'px', 'important');
|
| 370 |
+
svg.style.setProperty('max-width', 'none', 'important');
|
| 371 |
+
} catch {}
|
| 372 |
+
}
|
| 373 |
+
function fixSvg(svg){
|
| 374 |
+
if (!svg) return;
|
| 375 |
+
// Do not alter hero banner SVG sizing; it may rely on explicit width/height
|
| 376 |
+
try { if (svg.closest && svg.closest('.hero-banner')) return; } catch {}
|
| 377 |
+
if (isSmallSvg(svg)) { lockSmallSvgSize(svg); return; }
|
| 378 |
+
try { svg.removeAttribute('width'); } catch {}
|
| 379 |
+
try { svg.removeAttribute('height'); } catch {}
|
| 380 |
+
svg.style.maxWidth = '100%';
|
| 381 |
+
svg.style.width = '100%';
|
| 382 |
+
svg.style.height = 'auto';
|
| 383 |
+
if (!svg.getAttribute('preserveAspectRatio')) svg.setAttribute('preserveAspectRatio','xMidYMid meet');
|
| 384 |
+
}
|
| 385 |
+
document.querySelectorAll('svg').forEach((svg)=>{ if (isSmallSvg(svg)) lockSmallSvgSize(svg); else fixSvg(svg); });
|
| 386 |
+
document.querySelectorAll('.mermaid, .mermaid svg').forEach((el)=>{
|
| 387 |
+
if (el.tagName && el.tagName.toLowerCase() === 'svg') fixSvg(el);
|
| 388 |
+
else { el.style.display='block'; el.style.width='100%'; el.style.maxWidth='100%'; }
|
| 389 |
+
});
|
| 390 |
+
document.querySelectorAll('iframe, embed, object').forEach((el) => {
|
| 391 |
+
el.style.width = '100%';
|
| 392 |
+
el.style.maxWidth = '100%';
|
| 393 |
+
try { el.removeAttribute('width'); } catch {}
|
| 394 |
+
try {
|
| 395 |
+
const doc = (el.tagName.toLowerCase()==='object' ? el.contentDocument : el.contentDocument);
|
| 396 |
+
if (doc && doc.head) {
|
| 397 |
+
const s = doc.createElement('style');
|
| 398 |
+
s.textContent = 'html,body{overflow-x:hidden;} svg,canvas,img,video{max-width:100%!important;height:auto!important;} svg[width]{width:100%!important}';
|
| 399 |
+
doc.head.appendChild(s);
|
| 400 |
+
doc.querySelectorAll('svg').forEach((svg)=>{ if (isSmallSvg(svg)) lockSmallSvgSize(svg); else fixSvg(svg); });
|
| 401 |
+
}
|
| 402 |
+
} catch (_) {}
|
| 403 |
+
});
|
| 404 |
+
});
|
| 405 |
+
} catch {}
|
| 406 |
+
} catch {}
|
| 407 |
+
// Temporarily enforce print-safe responsive sizing (SVG/iframes) and improve banner visibility
|
| 408 |
+
let pdfCssHandle = null;
|
| 409 |
+
try {
|
| 410 |
+
pdfCssHandle = await page.addStyleTag({ content: `
|
| 411 |
+
/* General container safety */
|
| 412 |
+
html, body { overflow-x: hidden !important; }
|
| 413 |
+
|
| 414 |
+
/* Make all vector/bitmap media responsive for print */
|
| 415 |
+
svg, canvas, img, video { max-width: 100% !important; height: auto !important; }
|
| 416 |
+
/* Mermaid diagrams */
|
| 417 |
+
.mermaid, .mermaid svg { display: block; width: 100% !important; max-width: 100% !important; height: auto !important; }
|
| 418 |
+
/* Any explicit width attributes */
|
| 419 |
+
svg[width] { width: 100% !important; }
|
| 420 |
+
/* Iframes and similar embeds */
|
| 421 |
+
iframe, embed, object { width: 100% !important; max-width: 100% !important; height: auto; }
|
| 422 |
+
|
| 423 |
+
/* HtmlEmbed wrappers (defensive) */
|
| 424 |
+
.html-embed, .html-embed__card { max-width: 100% !important; width: 100% !important; }
|
| 425 |
+
.html-embed__card > div[id^="frag-"] { width: 100% !important; max-width: 100% !important; }
|
| 426 |
+
|
| 427 |
+
/* Banner centering & visibility */
|
| 428 |
+
.hero .points { mix-blend-mode: normal !important; }
|
| 429 |
+
/* Do NOT force a fixed height to avoid clipping in PDF */
|
| 430 |
+
.hero-banner { width: 100% !important; max-width: 980px !important; margin-left: auto !important; margin-right: auto !important; }
|
| 431 |
+
.hero-banner svg { width: 100% !important; height: auto !important; }
|
| 432 |
+
` });
|
| 433 |
+
} catch {}
|
| 434 |
+
await page.pdf({
|
| 435 |
+
path: outPath,
|
| 436 |
+
format,
|
| 437 |
+
printBackground: true,
|
| 438 |
+
margin
|
| 439 |
+
});
|
| 440 |
+
try { if (pdfCssHandle) await pdfCssHandle.evaluate((el) => el.remove()); } catch {}
|
| 441 |
+
console.log(`✅ PDF generated: ${outPath}`);
|
| 442 |
+
|
| 443 |
+
// Copy into public only under the slugified name
|
| 444 |
+
const publicSlugPath = resolve(cwd, 'public', `${outFileBase}.pdf`);
|
| 445 |
+
try {
|
| 446 |
+
await fs.mkdir(resolve(cwd, 'public'), { recursive: true });
|
| 447 |
+
await fs.copyFile(outPath, publicSlugPath);
|
| 448 |
+
console.log(`✅ PDF copied to: ${publicSlugPath}`);
|
| 449 |
+
} catch (e) {
|
| 450 |
+
console.warn('Unable to copy PDF to public/:', e?.message || e);
|
| 451 |
+
}
|
| 452 |
+
} finally {
|
| 453 |
+
await browser.close();
|
| 454 |
+
}
|
| 455 |
+
} finally {
|
| 456 |
+
// Try a clean shutdown of preview (entire process group first)
|
| 457 |
+
try {
|
| 458 |
+
if (process.platform !== 'win32') {
|
| 459 |
+
try { process.kill(-preview.pid, 'SIGINT'); } catch {}
|
| 460 |
+
}
|
| 461 |
+
try { preview.kill('SIGINT'); } catch {}
|
| 462 |
+
await Promise.race([previewExit, delay(3000)]);
|
| 463 |
+
// Force kill if still alive
|
| 464 |
+
// eslint-disable-next-line no-unsafe-optional-chaining
|
| 465 |
+
if (!preview.killed) {
|
| 466 |
+
try {
|
| 467 |
+
if (process.platform !== 'win32') {
|
| 468 |
+
try { process.kill(-preview.pid, 'SIGKILL'); } catch {}
|
| 469 |
+
}
|
| 470 |
+
try { preview.kill('SIGKILL'); } catch {}
|
| 471 |
+
} catch {}
|
| 472 |
+
await Promise.race([previewExit, delay(1000)]);
|
| 473 |
+
}
|
| 474 |
+
} catch {}
|
| 475 |
+
}
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
main().catch((err) => {
|
| 479 |
+
console.error(err);
|
| 480 |
+
process.exit(1);
|
| 481 |
+
});
|
| 482 |
+
|
| 483 |
+
|
app/scripts/generate-trackio-data.mjs
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
// Generate synthetic Trackio-like CSV data with realistic ML curves.
|
| 4 |
+
// - Steps are simple integers (e.g., 1..N)
|
| 5 |
+
// - Metrics: epoch, train_accuracy, val_accuracy, train_loss, val_loss
|
| 6 |
+
// - W&B-like run names (e.g., pleasant-flower-1)
|
| 7 |
+
// - Deterministic with --seed
|
| 8 |
+
//
|
| 9 |
+
// Usage:
|
| 10 |
+
// node app/scripts/generate-trackio-data.mjs \
|
| 11 |
+
// --runs 3 \
|
| 12 |
+
// --steps 10 \
|
| 13 |
+
// --out app/src/content/assets/data/trackio_wandb_synth.csv \
|
| 14 |
+
// [--seed 42] [--epoch-max 3.0] [--amount 1.0] [--start 1]
|
| 15 |
+
//
|
| 16 |
+
// To overwrite the demo file used by the embed:
|
| 17 |
+
// node app/scripts/generate-trackio-data.mjs --runs 3 --steps 10 --out app/src/content/assets/data/trackio_wandb_demo.csv --seed 1337
|
| 18 |
+
|
| 19 |
+
import fs from 'node:fs/promises';
|
| 20 |
+
import path from 'node:path';
|
| 21 |
+
|
| 22 |
+
function parseArgs(argv){
|
| 23 |
+
const args = { runs: 3, steps: 10, out: '', seed: undefined, epochMax: 3.0, amount: 1, start: 1 };
|
| 24 |
+
for (let i = 2; i < argv.length; i++){
|
| 25 |
+
const a = argv[i];
|
| 26 |
+
if (a === '--runs' && argv[i+1]) { args.runs = Math.max(1, parseInt(argv[++i], 10) || 3); continue; }
|
| 27 |
+
if (a === '--steps' && argv[i+1]) { args.steps = Math.max(2, parseInt(argv[++i], 10) || 10); continue; }
|
| 28 |
+
if (a === '--out' && argv[i+1]) { args.out = argv[++i]; continue; }
|
| 29 |
+
if (a === '--seed' && argv[i+1]) { args.seed = Number(argv[++i]); continue; }
|
| 30 |
+
if (a === '--epoch-max' && argv[i+1]) { args.epochMax = Number(argv[++i]) || 3.0; continue; }
|
| 31 |
+
if (a === '--amount' && argv[i+1]) { args.amount = Number(argv[++i]) || 1.0; continue; }
|
| 32 |
+
if (a === '--start' && argv[i+1]) { args.start = parseInt(argv[++i], 10) || 1; continue; }
|
| 33 |
+
}
|
| 34 |
+
if (!args.out) {
|
| 35 |
+
args.out = path.join('app', 'src', 'content', 'assets', 'data', 'trackio_wandb_synth.csv');
|
| 36 |
+
}
|
| 37 |
+
return args;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
function mulberry32(seed){
|
| 41 |
+
let t = seed >>> 0;
|
| 42 |
+
return function(){
|
| 43 |
+
t += 0x6D2B79F5;
|
| 44 |
+
let r = Math.imul(t ^ (t >>> 15), 1 | t);
|
| 45 |
+
r ^= r + Math.imul(r ^ (r >>> 7), 61 | r);
|
| 46 |
+
return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
|
| 47 |
+
};
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
function makeRng(seed){
|
| 51 |
+
if (Number.isFinite(seed)) return mulberry32(seed);
|
| 52 |
+
return Math.random;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
function randn(rng){
|
| 56 |
+
// Box-Muller transform
|
| 57 |
+
let u = 0, v = 0;
|
| 58 |
+
while (u === 0) u = rng();
|
| 59 |
+
while (v === 0) v = rng();
|
| 60 |
+
return Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
function clamp(x, lo, hi){
|
| 64 |
+
return Math.max(lo, Math.min(hi, x));
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
function logistic(t, k=6, x0=0.5){
|
| 68 |
+
// 1 / (1 + e^{-k (t - x0)}) in [0,1]
|
| 69 |
+
return 1 / (1 + Math.exp(-k * (t - x0)));
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
function expDecay(t, k=3){
|
| 73 |
+
// (1 - e^{-k t}) in [0,1]
|
| 74 |
+
return 1 - Math.exp(-k * t);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
function pick(array, rng){
|
| 78 |
+
return array[Math.floor(rng() * array.length) % array.length];
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
function buildRunNames(count, rng){
|
| 82 |
+
const adjectives = [
|
| 83 |
+
'pleasant','brisk','silent','ancient','bold','gentle','rapid','shy','curious','lively',
|
| 84 |
+
'fearless','soothing','glossy','hidden','misty','bright','calm','keen','noble','swift'
|
| 85 |
+
];
|
| 86 |
+
const nouns = [
|
| 87 |
+
'flower','glade','sky','river','forest','ember','comet','meadow','harbor','dawn',
|
| 88 |
+
'mountain','prairie','breeze','valley','lagoon','desert','monsoon','reef','thunder','willow'
|
| 89 |
+
];
|
| 90 |
+
const names = new Set();
|
| 91 |
+
let attempts = 0;
|
| 92 |
+
while (names.size < count && attempts < count * 20){
|
| 93 |
+
attempts++;
|
| 94 |
+
const left = pick(adjectives, rng);
|
| 95 |
+
const right = pick(nouns, rng);
|
| 96 |
+
const idx = 1 + Math.floor(rng() * 9);
|
| 97 |
+
names.add(`${left}-${right}-${idx}`);
|
| 98 |
+
}
|
| 99 |
+
return Array.from(names);
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
function formatLike(value, decimals){
|
| 103 |
+
return Number.isFinite(decimals) && decimals >= 0 ? value.toFixed(decimals) : String(value);
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
async function main(){
|
| 107 |
+
const args = parseArgs(process.argv);
|
| 108 |
+
const rng = makeRng(args.seed);
|
| 109 |
+
|
| 110 |
+
// Steps: integers from start .. start+steps-1
|
| 111 |
+
const steps = Array.from({ length: args.steps }, (_, i) => args.start + i);
|
| 112 |
+
const stepNorm = (i) => (i - steps[0]) / (steps[steps.length-1] - steps[0]);
|
| 113 |
+
|
| 114 |
+
const runs = buildRunNames(args.runs, rng);
|
| 115 |
+
|
| 116 |
+
// Per-run slight variations
|
| 117 |
+
const runParams = runs.map((_r, idx) => {
|
| 118 |
+
const r = rng();
|
| 119 |
+
// Final accuracies
|
| 120 |
+
const trainAccFinal = clamp(0.86 + (r - 0.5) * 0.12 * args.amount, 0.78, 0.97);
|
| 121 |
+
const valAccFinal = clamp(trainAccFinal - (0.02 + rng() * 0.05), 0.70, 0.95);
|
| 122 |
+
// Loss plateau
|
| 123 |
+
const lossStart = 7.0 + (rng() - 0.5) * 0.10 * args.amount; // ~7.0 ±0.05
|
| 124 |
+
const lossPlateau = 6.78 + (rng() - 0.5) * 0.04 * args.amount; // ~6.78 ±0.02
|
| 125 |
+
const lossK = 2.0 + rng() * 1.5; // decay speed
|
| 126 |
+
// Acc growth steepness and midpoint
|
| 127 |
+
const kAcc = 4.5 + rng() * 3.0;
|
| 128 |
+
const x0Acc = 0.35 + rng() * 0.25;
|
| 129 |
+
return { trainAccFinal, valAccFinal, lossStart, lossPlateau, lossK, kAcc, x0Acc };
|
| 130 |
+
});
|
| 131 |
+
|
| 132 |
+
const lines = [];
|
| 133 |
+
lines.push('run,step,metric,value,stderr');
|
| 134 |
+
|
| 135 |
+
// EPOCH: linear 0..epochMax across steps
|
| 136 |
+
for (let r = 0; r < runs.length; r++){
|
| 137 |
+
const run = runs[r];
|
| 138 |
+
for (let i = 0; i < steps.length; i++){
|
| 139 |
+
const t = stepNorm(steps[i]);
|
| 140 |
+
const epoch = args.epochMax * t;
|
| 141 |
+
lines.push(`${run},${steps[i]},epoch,${formatLike(epoch, 2)},`);
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
// TRAIN LOSS & VAL LOSS
|
| 146 |
+
for (let r = 0; r < runs.length; r++){
|
| 147 |
+
const run = runs[r];
|
| 148 |
+
const p = runParams[r];
|
| 149 |
+
let prevTrain = null;
|
| 150 |
+
let prevVal = null;
|
| 151 |
+
for (let i = 0; i < steps.length; i++){
|
| 152 |
+
const t = stepNorm(steps[i]);
|
| 153 |
+
const d = expDecay(t, p.lossK); // 0..1
|
| 154 |
+
let trainLoss = p.lossStart - (p.lossStart - p.lossPlateau) * d;
|
| 155 |
+
let valLoss = trainLoss + 0.02 + (rng() * 0.03);
|
| 156 |
+
// Add mild noise
|
| 157 |
+
trainLoss += randn(rng) * 0.01 * args.amount;
|
| 158 |
+
valLoss += randn(rng) * 0.012 * args.amount;
|
| 159 |
+
// Keep reasonable and mostly monotonic (small upward blips allowed)
|
| 160 |
+
if (prevTrain != null) trainLoss = Math.min(prevTrain + 0.01, trainLoss);
|
| 161 |
+
if (prevVal != null) valLoss = Math.min(prevVal + 0.012, valLoss);
|
| 162 |
+
prevTrain = trainLoss; prevVal = valLoss;
|
| 163 |
+
const stderrTrain = clamp(0.03 - 0.02 * t + Math.abs(randn(rng)) * 0.003, 0.006, 0.04);
|
| 164 |
+
const stderrVal = clamp(0.035 - 0.022 * t + Math.abs(randn(rng)) * 0.003, 0.008, 0.045);
|
| 165 |
+
lines.push(`${run},${steps[i]},train_loss,${formatLike(trainLoss, 3)},${formatLike(stderrTrain, 3)}`);
|
| 166 |
+
lines.push(`${run},${steps[i]},val_loss,${formatLike(valLoss, 3)},${formatLike(stderrVal, 3)}`);
|
| 167 |
+
}
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
// TRAIN ACCURACY & VAL ACCURACY (logistic)
|
| 171 |
+
for (let r = 0; r < runs.length; r++){
|
| 172 |
+
const run = runs[r];
|
| 173 |
+
const p = runParams[r];
|
| 174 |
+
for (let i = 0; i < steps.length; i++){
|
| 175 |
+
const t = stepNorm(steps[i]);
|
| 176 |
+
const accBase = logistic(t, p.kAcc, p.x0Acc);
|
| 177 |
+
let trainAcc = clamp(0.55 + accBase * (p.trainAccFinal - 0.55), 0, 1);
|
| 178 |
+
let valAcc = clamp(0.52 + accBase * (p.valAccFinal - 0.52), 0, 1);
|
| 179 |
+
// Gentle noise
|
| 180 |
+
trainAcc = clamp(trainAcc + randn(rng) * 0.005 * args.amount, 0, 1);
|
| 181 |
+
valAcc = clamp(valAcc + randn(rng) * 0.006 * args.amount, 0, 1);
|
| 182 |
+
const stderrTrain = clamp(0.02 - 0.011 * t + Math.abs(randn(rng)) * 0.002, 0.006, 0.03);
|
| 183 |
+
const stderrVal = clamp(0.022 - 0.012 * t + Math.abs(randn(rng)) * 0.002, 0.007, 0.032);
|
| 184 |
+
lines.push(`${run},${steps[i]},train_accuracy,${formatLike(trainAcc, 4)},${formatLike(stderrTrain, 3)}`);
|
| 185 |
+
lines.push(`${run},${steps[i]},val_accuracy,${formatLike(valAcc, 4)},${formatLike(stderrVal, 3)}`);
|
| 186 |
+
}
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
// Ensure directory exists
|
| 190 |
+
await fs.mkdir(path.dirname(args.out), { recursive: true });
|
| 191 |
+
await fs.writeFile(args.out, lines.join('\n') + '\n', 'utf8');
|
| 192 |
+
const relOut = path.relative(process.cwd(), args.out);
|
| 193 |
+
console.log(`Synthetic CSV generated: ${relOut}`);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
main().catch(err => { console.error(err?.stack || String(err)); process.exit(1); });
|
app/scripts/generate_ablation_data.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate ablation study data for D3 line chart embeds.
|
| 4 |
+
|
| 5 |
+
This script generates CSV files for:
|
| 6 |
+
1. From scratch ablation - single learning rate schedule
|
| 7 |
+
2. Annealing ablation - comparison between main pretraining and ablation decay
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Parameters
|
| 15 |
+
total_tokens = 100e9 # 100B tokens
|
| 16 |
+
warmup_tokens = 1.2e9 # 1.2B tokens (1.2% of total)
|
| 17 |
+
max_lr = 2e-4
|
| 18 |
+
|
| 19 |
+
def generate_from_scratch_schedule():
|
| 20 |
+
"""Generate from scratch learning rate schedule"""
|
| 21 |
+
# Based on the HTML: warmup to 5%, plateau until 85%, then decay
|
| 22 |
+
warmup_end = 0.05 # 5% of total tokens
|
| 23 |
+
decay_start = 0.85 # 85% of total tokens
|
| 24 |
+
|
| 25 |
+
schedule = []
|
| 26 |
+
for i in range(1000): # 1000 points for smooth curve
|
| 27 |
+
progress = i / 999 # 0 to 1
|
| 28 |
+
|
| 29 |
+
if progress < warmup_end:
|
| 30 |
+
# Linear warmup
|
| 31 |
+
lr = max_lr * (progress / warmup_end)
|
| 32 |
+
elif progress < decay_start:
|
| 33 |
+
# Plateau at max LR
|
| 34 |
+
lr = max_lr
|
| 35 |
+
else:
|
| 36 |
+
# Linear decay to 0
|
| 37 |
+
decay_progress = (progress - decay_start) / (1 - decay_start)
|
| 38 |
+
lr = max_lr * (1 - decay_progress)
|
| 39 |
+
|
| 40 |
+
tokens = progress * total_tokens
|
| 41 |
+
schedule.append({
|
| 42 |
+
'run_name': 'From scratch',
|
| 43 |
+
'tokens': tokens,
|
| 44 |
+
'learning_rate': lr
|
| 45 |
+
})
|
| 46 |
+
|
| 47 |
+
return schedule
|
| 48 |
+
|
| 49 |
+
def generate_annealing_schedules():
|
| 50 |
+
"""Generate annealing ablation schedules"""
|
| 51 |
+
# Main pretraining run parameters
|
| 52 |
+
main_warmup_end = 0.012 # 1.2% of total tokens
|
| 53 |
+
main_decay_start = 0.80 # 80% of total tokens
|
| 54 |
+
main_end = 0.95 # 95% of total tokens
|
| 55 |
+
|
| 56 |
+
# Ablation run parameters
|
| 57 |
+
ablation_start = 0.64 # 64% of total tokens
|
| 58 |
+
ablation_duration = 0.10 # 10% of total tokens
|
| 59 |
+
ablation_end = ablation_start + ablation_duration
|
| 60 |
+
|
| 61 |
+
schedules = []
|
| 62 |
+
|
| 63 |
+
# Main pretraining run
|
| 64 |
+
for i in range(1000):
|
| 65 |
+
progress = i / 999
|
| 66 |
+
|
| 67 |
+
if progress < main_warmup_end:
|
| 68 |
+
lr = max_lr * (progress / main_warmup_end)
|
| 69 |
+
elif progress < main_decay_start:
|
| 70 |
+
lr = max_lr
|
| 71 |
+
elif progress < main_end:
|
| 72 |
+
# Linear decay
|
| 73 |
+
decay_progress = (progress - main_decay_start) / (main_end - main_decay_start)
|
| 74 |
+
lr = max_lr * (1 - decay_progress)
|
| 75 |
+
else:
|
| 76 |
+
lr = 0
|
| 77 |
+
|
| 78 |
+
tokens = progress * total_tokens
|
| 79 |
+
schedules.append({
|
| 80 |
+
'run_name': 'Main pretraining',
|
| 81 |
+
'tokens': tokens,
|
| 82 |
+
'learning_rate': lr
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
# Ablation run (starts from plateau and decays)
|
| 86 |
+
for i in range(1000):
|
| 87 |
+
progress = i / 999
|
| 88 |
+
|
| 89 |
+
if progress < ablation_start:
|
| 90 |
+
lr = max_lr # Plateau
|
| 91 |
+
elif progress < ablation_end:
|
| 92 |
+
# Linear decay during ablation period
|
| 93 |
+
decay_progress = (progress - ablation_start) / (ablation_end - ablation_start)
|
| 94 |
+
lr = max_lr * (1 - decay_progress)
|
| 95 |
+
else:
|
| 96 |
+
lr = 0
|
| 97 |
+
|
| 98 |
+
tokens = progress * total_tokens
|
| 99 |
+
schedules.append({
|
| 100 |
+
'run_name': 'Ablation decay',
|
| 101 |
+
'tokens': tokens,
|
| 102 |
+
'learning_rate': lr
|
| 103 |
+
})
|
| 104 |
+
|
| 105 |
+
return schedules
|
| 106 |
+
|
| 107 |
+
def main():
|
| 108 |
+
# Create output directory if it doesn't exist
|
| 109 |
+
output_dir = "src/content/assets/data"
|
| 110 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 111 |
+
|
| 112 |
+
print("Generating ablation study data...")
|
| 113 |
+
|
| 114 |
+
# Generate from scratch schedule
|
| 115 |
+
from_scratch_data = generate_from_scratch_schedule()
|
| 116 |
+
df_from_scratch = pd.DataFrame(from_scratch_data)
|
| 117 |
+
df_from_scratch.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
|
| 118 |
+
print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(df_from_scratch)} rows")
|
| 119 |
+
|
| 120 |
+
# Generate annealing schedules
|
| 121 |
+
annealing_data = generate_annealing_schedules()
|
| 122 |
+
df_annealing = pd.DataFrame(annealing_data)
|
| 123 |
+
df_annealing.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
|
| 124 |
+
print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(df_annealing)} rows")
|
| 125 |
+
|
| 126 |
+
print("\n✓ Done! CSV files generated successfully.")
|
| 127 |
+
print("\nNext steps:")
|
| 128 |
+
print("1. Use from_scratch_ablation.csv for the first plot")
|
| 129 |
+
print("2. Use annealing_ablation.csv for the second plot")
|
| 130 |
+
|
| 131 |
+
if __name__ == "__main__":
|
| 132 |
+
main()
|
app/scripts/generate_ablation_data_correct.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate ablation study data for D3 line chart embeds.
|
| 4 |
+
|
| 5 |
+
This script generates CSV files for:
|
| 6 |
+
1. From scratch ablation - single learning rate schedule
|
| 7 |
+
2. Annealing ablation - comparison between main pretraining and ablation decay
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Parameters
|
| 15 |
+
max_lr = 2e-4
|
| 16 |
+
|
| 17 |
+
def generate_from_scratch_schedule():
|
| 18 |
+
"""Generate from scratch learning rate schedule - goes to 100B tokens"""
|
| 19 |
+
total_tokens = 100e9 # 100B tokens
|
| 20 |
+
warmup_end = 0.05 # 5% of total tokens
|
| 21 |
+
decay_start = 0.85 # 85% of total tokens
|
| 22 |
+
|
| 23 |
+
schedule = []
|
| 24 |
+
for i in range(1000): # 1000 points for smooth curve
|
| 25 |
+
progress = i / 999 # 0 to 1
|
| 26 |
+
|
| 27 |
+
if progress < warmup_end:
|
| 28 |
+
# Linear warmup
|
| 29 |
+
lr = max_lr * (progress / warmup_end)
|
| 30 |
+
elif progress < decay_start:
|
| 31 |
+
# Plateau at max LR
|
| 32 |
+
lr = max_lr
|
| 33 |
+
else:
|
| 34 |
+
# Linear decay to 0
|
| 35 |
+
decay_progress = (progress - decay_start) / (1 - decay_start)
|
| 36 |
+
lr = max_lr * (1 - decay_progress)
|
| 37 |
+
|
| 38 |
+
tokens = progress * total_tokens
|
| 39 |
+
schedule.append({
|
| 40 |
+
'run_name': 'From scratch',
|
| 41 |
+
'tokens': tokens,
|
| 42 |
+
'learning_rate': lr
|
| 43 |
+
})
|
| 44 |
+
|
| 45 |
+
# Stop adding points once learning rate reaches 0 (but include the 0 point)
|
| 46 |
+
if lr == 0:
|
| 47 |
+
break
|
| 48 |
+
|
| 49 |
+
# Add the final 0 point if not already added
|
| 50 |
+
if schedule[-1]['learning_rate'] != 0:
|
| 51 |
+
schedule.append({
|
| 52 |
+
'run_name': 'From scratch',
|
| 53 |
+
'tokens': total_tokens,
|
| 54 |
+
'learning_rate': 0
|
| 55 |
+
})
|
| 56 |
+
|
| 57 |
+
return schedule
|
| 58 |
+
|
| 59 |
+
def generate_annealing_schedules():
|
| 60 |
+
"""Generate annealing ablation schedules - goes to 11T tokens"""
|
| 61 |
+
total_tokens = 11e12 # 11T tokens
|
| 62 |
+
main_warmup_end = 0.012 # 1.2% of total tokens
|
| 63 |
+
main_decay_start = 0.80 # 80% of total tokens
|
| 64 |
+
main_end = 0.95 # 95% of total tokens
|
| 65 |
+
|
| 66 |
+
# Ablation run parameters - start earlier so it reaches 0 at 7.1T
|
| 67 |
+
ablation_start = 0.55 # Start earlier
|
| 68 |
+
ablation_end = 0.645 # End at 7.1T (64.5% of 11T)
|
| 69 |
+
|
| 70 |
+
schedules = []
|
| 71 |
+
|
| 72 |
+
# Main pretraining run
|
| 73 |
+
for i in range(1000):
|
| 74 |
+
progress = i / 999
|
| 75 |
+
|
| 76 |
+
if progress < main_warmup_end:
|
| 77 |
+
lr = max_lr * (progress / main_warmup_end)
|
| 78 |
+
elif progress < main_decay_start:
|
| 79 |
+
lr = max_lr
|
| 80 |
+
elif progress < main_end:
|
| 81 |
+
# Linear decay
|
| 82 |
+
decay_progress = (progress - main_decay_start) / (main_end - main_decay_start)
|
| 83 |
+
lr = max_lr * (1 - decay_progress)
|
| 84 |
+
else:
|
| 85 |
+
lr = 0
|
| 86 |
+
|
| 87 |
+
tokens = progress * total_tokens
|
| 88 |
+
schedules.append({
|
| 89 |
+
'run_name': 'Main pretraining',
|
| 90 |
+
'tokens': tokens,
|
| 91 |
+
'learning_rate': lr
|
| 92 |
+
})
|
| 93 |
+
|
| 94 |
+
# Stop adding points once learning rate reaches 0
|
| 95 |
+
if lr == 0:
|
| 96 |
+
break
|
| 97 |
+
|
| 98 |
+
# Ablation run (starts from plateau and decays)
|
| 99 |
+
for i in range(1000):
|
| 100 |
+
progress = i / 999
|
| 101 |
+
|
| 102 |
+
if progress < ablation_start:
|
| 103 |
+
lr = max_lr # Plateau
|
| 104 |
+
elif progress < ablation_end:
|
| 105 |
+
# Linear decay during ablation period
|
| 106 |
+
decay_progress = (progress - ablation_start) / (ablation_end - ablation_start)
|
| 107 |
+
lr = max_lr * (1 - decay_progress)
|
| 108 |
+
else:
|
| 109 |
+
lr = 0
|
| 110 |
+
|
| 111 |
+
tokens = progress * total_tokens
|
| 112 |
+
schedules.append({
|
| 113 |
+
'run_name': 'Ablation decay',
|
| 114 |
+
'tokens': tokens,
|
| 115 |
+
'learning_rate': lr
|
| 116 |
+
})
|
| 117 |
+
|
| 118 |
+
# Stop adding points once learning rate reaches 0
|
| 119 |
+
if lr == 0:
|
| 120 |
+
break
|
| 121 |
+
|
| 122 |
+
# Add the final 0 point for ablation decay if not already added
|
| 123 |
+
if schedules[-1]['learning_rate'] != 0:
|
| 124 |
+
schedules.append({
|
| 125 |
+
'run_name': 'Ablation decay',
|
| 126 |
+
'tokens': total_tokens,
|
| 127 |
+
'learning_rate': 0
|
| 128 |
+
})
|
| 129 |
+
|
| 130 |
+
return schedules
|
| 131 |
+
|
| 132 |
+
def main():
|
| 133 |
+
# Create output directory if it doesn't exist
|
| 134 |
+
output_dir = "src/content/assets/data"
|
| 135 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 136 |
+
|
| 137 |
+
print("Generating ablation study data...")
|
| 138 |
+
|
| 139 |
+
# Generate from scratch schedule
|
| 140 |
+
from_scratch_data = generate_from_scratch_schedule()
|
| 141 |
+
df_from_scratch = pd.DataFrame(from_scratch_data)
|
| 142 |
+
df_from_scratch.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
|
| 143 |
+
print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(df_from_scratch)} rows")
|
| 144 |
+
|
| 145 |
+
# Generate annealing schedules
|
| 146 |
+
annealing_data = generate_annealing_schedules()
|
| 147 |
+
df_annealing = pd.DataFrame(annealing_data)
|
| 148 |
+
df_annealing.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
|
| 149 |
+
print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(df_annealing)} rows")
|
| 150 |
+
|
| 151 |
+
print("\n✓ Done! CSV files generated successfully.")
|
| 152 |
+
print("\nNext steps:")
|
| 153 |
+
print("1. Use from_scratch_ablation.csv for the first plot")
|
| 154 |
+
print("2. Use annealing_ablation.csv for the second plot")
|
| 155 |
+
|
| 156 |
+
if __name__ == "__main__":
|
| 157 |
+
main()
|
app/scripts/generate_ablation_data_final.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate ablation study data for D3 line chart embeds.
|
| 4 |
+
|
| 5 |
+
This script generates CSV files for:
|
| 6 |
+
1. From scratch ablation - single learning rate schedule
|
| 7 |
+
2. Annealing ablation - comparison between main pretraining and ablation decay
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Parameters
|
| 15 |
+
max_lr = 2e-4
|
| 16 |
+
|
| 17 |
+
def generate_from_scratch_schedule():
|
| 18 |
+
"""Generate from scratch learning rate schedule - goes to 100B tokens"""
|
| 19 |
+
total_tokens = 100e9 # 100B tokens
|
| 20 |
+
warmup_end = 0.05 # 5% of total tokens
|
| 21 |
+
decay_start = 0.85 # 85% of total tokens
|
| 22 |
+
|
| 23 |
+
schedule = []
|
| 24 |
+
for i in range(1000): # 1000 points for smooth curve
|
| 25 |
+
progress = i / 999 # 0 to 1
|
| 26 |
+
|
| 27 |
+
if progress < warmup_end:
|
| 28 |
+
# Linear warmup
|
| 29 |
+
lr = max_lr * (progress / warmup_end)
|
| 30 |
+
elif progress < decay_start:
|
| 31 |
+
# Plateau at max LR
|
| 32 |
+
lr = max_lr
|
| 33 |
+
else:
|
| 34 |
+
# Linear decay to 0
|
| 35 |
+
decay_progress = (progress - decay_start) / (1 - decay_start)
|
| 36 |
+
lr = max_lr * (1 - decay_progress)
|
| 37 |
+
|
| 38 |
+
tokens = progress * total_tokens
|
| 39 |
+
schedule.append({
|
| 40 |
+
'run_name': 'From scratch',
|
| 41 |
+
'tokens': tokens,
|
| 42 |
+
'learning_rate': lr
|
| 43 |
+
})
|
| 44 |
+
|
| 45 |
+
# Filter out points after learning rate reaches 0
|
| 46 |
+
filtered_schedule = []
|
| 47 |
+
for point in schedule:
|
| 48 |
+
filtered_schedule.append(point)
|
| 49 |
+
if point['learning_rate'] == 0 and len(filtered_schedule) > 1:
|
| 50 |
+
break
|
| 51 |
+
|
| 52 |
+
return filtered_schedule
|
| 53 |
+
|
| 54 |
+
def generate_annealing_schedules():
|
| 55 |
+
"""Generate annealing ablation schedules - goes to 11T tokens"""
|
| 56 |
+
total_tokens = 11e12 # 11T tokens
|
| 57 |
+
main_warmup_end = 0.012 # 1.2% of total tokens
|
| 58 |
+
main_decay_start = 0.80 # 80% of total tokens
|
| 59 |
+
main_end = 0.95 # 95% of total tokens
|
| 60 |
+
|
| 61 |
+
# Ablation run parameters - start earlier so it reaches 0 at 7.1T
|
| 62 |
+
ablation_start = 0.55 # Start earlier
|
| 63 |
+
ablation_end = 0.645 # End at 7.1T (64.5% of 11T)
|
| 64 |
+
|
| 65 |
+
schedules = []
|
| 66 |
+
|
| 67 |
+
# Main pretraining run
|
| 68 |
+
for i in range(1000):
|
| 69 |
+
progress = i / 999
|
| 70 |
+
|
| 71 |
+
if progress < main_warmup_end:
|
| 72 |
+
lr = max_lr * (progress / main_warmup_end)
|
| 73 |
+
elif progress < main_decay_start:
|
| 74 |
+
lr = max_lr
|
| 75 |
+
elif progress < main_end:
|
| 76 |
+
# Linear decay
|
| 77 |
+
decay_progress = (progress - main_decay_start) / (main_end - main_decay_start)
|
| 78 |
+
lr = max_lr * (1 - decay_progress)
|
| 79 |
+
else:
|
| 80 |
+
lr = 0
|
| 81 |
+
|
| 82 |
+
tokens = progress * total_tokens
|
| 83 |
+
schedules.append({
|
| 84 |
+
'run_name': 'Main pretraining',
|
| 85 |
+
'tokens': tokens,
|
| 86 |
+
'learning_rate': lr
|
| 87 |
+
})
|
| 88 |
+
|
| 89 |
+
# Ablation run (identical to main pretraining until decay starts at 7.1T)
|
| 90 |
+
for i in range(1000):
|
| 91 |
+
progress = i / 999
|
| 92 |
+
|
| 93 |
+
if progress < main_warmup_end:
|
| 94 |
+
# Same warmup as main pretraining
|
| 95 |
+
lr = max_lr * (progress / main_warmup_end)
|
| 96 |
+
elif progress < ablation_start:
|
| 97 |
+
# Same plateau as main pretraining
|
| 98 |
+
lr = max_lr
|
| 99 |
+
elif progress < ablation_end:
|
| 100 |
+
# Linear decay during ablation period (starts at 7.1T)
|
| 101 |
+
decay_progress = (progress - ablation_start) / (ablation_end - ablation_start)
|
| 102 |
+
lr = max_lr * (1 - decay_progress)
|
| 103 |
+
else:
|
| 104 |
+
lr = 0
|
| 105 |
+
|
| 106 |
+
tokens = progress * total_tokens
|
| 107 |
+
schedules.append({
|
| 108 |
+
'run_name': 'Ablation decay',
|
| 109 |
+
'tokens': tokens,
|
| 110 |
+
'learning_rate': lr
|
| 111 |
+
})
|
| 112 |
+
|
| 113 |
+
# Filter out points after learning rate reaches 0 for each series
|
| 114 |
+
filtered_schedules = []
|
| 115 |
+
main_pretraining_data = [s for s in schedules if s['run_name'] == 'Main pretraining']
|
| 116 |
+
ablation_decay_data = [s for s in schedules if s['run_name'] == 'Ablation decay']
|
| 117 |
+
|
| 118 |
+
# Filter main pretraining - keep all points until 11T
|
| 119 |
+
for point in main_pretraining_data:
|
| 120 |
+
filtered_schedules.append(point)
|
| 121 |
+
# Stop when learning rate reaches 0 (should be around 11T)
|
| 122 |
+
if point['learning_rate'] == 0 and len([s for s in filtered_schedules if s['run_name'] == 'Main pretraining']) > 1:
|
| 123 |
+
break
|
| 124 |
+
|
| 125 |
+
# Filter ablation decay
|
| 126 |
+
for point in ablation_decay_data:
|
| 127 |
+
filtered_schedules.append(point)
|
| 128 |
+
if point['learning_rate'] == 0 and len([s for s in filtered_schedules if s['run_name'] == 'Ablation decay']) > 1:
|
| 129 |
+
break
|
| 130 |
+
|
| 131 |
+
return filtered_schedules
|
| 132 |
+
|
| 133 |
+
def main():
|
| 134 |
+
# Create output directory if it doesn't exist
|
| 135 |
+
output_dir = "src/content/assets/data"
|
| 136 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 137 |
+
|
| 138 |
+
print("Generating ablation study data...")
|
| 139 |
+
|
| 140 |
+
# Generate from scratch schedule
|
| 141 |
+
from_scratch_data = generate_from_scratch_schedule()
|
| 142 |
+
df_from_scratch = pd.DataFrame(from_scratch_data)
|
| 143 |
+
df_from_scratch.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
|
| 144 |
+
print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(df_from_scratch)} rows")
|
| 145 |
+
|
| 146 |
+
# Generate annealing schedules
|
| 147 |
+
annealing_data = generate_annealing_schedules()
|
| 148 |
+
df_annealing = pd.DataFrame(annealing_data)
|
| 149 |
+
df_annealing.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
|
| 150 |
+
print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(df_annealing)} rows")
|
| 151 |
+
|
| 152 |
+
print("\n✓ Done! CSV files generated successfully.")
|
| 153 |
+
print("\nNext steps:")
|
| 154 |
+
print("1. Use from_scratch_ablation.csv for the first plot")
|
| 155 |
+
print("2. Use annealing_ablation.csv for the second plot")
|
| 156 |
+
|
| 157 |
+
if __name__ == "__main__":
|
| 158 |
+
main()
|
app/scripts/generate_ablation_data_fixed.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Generate ablation study data for D3 line chart embeds.
|
| 4 |
+
|
| 5 |
+
This script generates CSV files for:
|
| 6 |
+
1. From scratch ablation - single learning rate schedule
|
| 7 |
+
2. Annealing ablation - comparison between main pretraining and ablation decay
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Parameters
|
| 15 |
+
total_tokens = 100e9 # 100B tokens
|
| 16 |
+
max_lr = 2e-4
|
| 17 |
+
|
| 18 |
+
def generate_from_scratch_schedule():
|
| 19 |
+
"""Generate from scratch learning rate schedule"""
|
| 20 |
+
# Based on the HTML: warmup to 5%, plateau until 85%, then decay
|
| 21 |
+
warmup_end = 0.05 # 5% of total tokens
|
| 22 |
+
decay_start = 0.85 # 85% of total tokens
|
| 23 |
+
|
| 24 |
+
schedule = []
|
| 25 |
+
for i in range(1000): # 1000 points for smooth curve
|
| 26 |
+
progress = i / 999 # 0 to 1
|
| 27 |
+
|
| 28 |
+
if progress < warmup_end:
|
| 29 |
+
# Linear warmup
|
| 30 |
+
lr = max_lr * (progress / warmup_end)
|
| 31 |
+
elif progress < decay_start:
|
| 32 |
+
# Plateau at max LR
|
| 33 |
+
lr = max_lr
|
| 34 |
+
else:
|
| 35 |
+
# Linear decay to 0
|
| 36 |
+
decay_progress = (progress - decay_start) / (1 - decay_start)
|
| 37 |
+
lr = max_lr * (1 - decay_progress)
|
| 38 |
+
|
| 39 |
+
tokens = progress * total_tokens
|
| 40 |
+
schedule.append({
|
| 41 |
+
'run_name': 'From scratch',
|
| 42 |
+
'tokens': tokens,
|
| 43 |
+
'learning_rate': lr
|
| 44 |
+
})
|
| 45 |
+
|
| 46 |
+
return schedule
|
| 47 |
+
|
| 48 |
+
def generate_annealing_schedules():
|
| 49 |
+
"""Generate annealing ablation schedules"""
|
| 50 |
+
# Main pretraining run parameters
|
| 51 |
+
main_warmup_end = 0.012 # 1.2% of total tokens
|
| 52 |
+
main_decay_start = 0.80 # 80% of total tokens
|
| 53 |
+
main_end = 0.95 # 95% of total tokens
|
| 54 |
+
|
| 55 |
+
# Ablation run parameters
|
| 56 |
+
ablation_start = 0.64 # 64% of total tokens
|
| 57 |
+
ablation_duration = 0.10 # 10% of total tokens
|
| 58 |
+
ablation_end = ablation_start + ablation_duration
|
| 59 |
+
|
| 60 |
+
schedules = []
|
| 61 |
+
|
| 62 |
+
# Main pretraining run
|
| 63 |
+
for i in range(1000):
|
| 64 |
+
progress = i / 999
|
| 65 |
+
|
| 66 |
+
if progress < main_warmup_end:
|
| 67 |
+
lr = max_lr * (progress / main_warmup_end)
|
| 68 |
+
elif progress < main_decay_start:
|
| 69 |
+
lr = max_lr
|
| 70 |
+
elif progress < main_end:
|
| 71 |
+
# Linear decay
|
| 72 |
+
decay_progress = (progress - main_decay_start) / (main_end - main_decay_start)
|
| 73 |
+
lr = max_lr * (1 - decay_progress)
|
| 74 |
+
else:
|
| 75 |
+
lr = 0
|
| 76 |
+
|
| 77 |
+
tokens = progress * total_tokens
|
| 78 |
+
schedules.append({
|
| 79 |
+
'run_name': 'Main pretraining',
|
| 80 |
+
'tokens': tokens,
|
| 81 |
+
'learning_rate': lr
|
| 82 |
+
})
|
| 83 |
+
|
| 84 |
+
# Ablation run (starts from plateau and decays)
|
| 85 |
+
for i in range(1000):
|
| 86 |
+
progress = i / 999
|
| 87 |
+
|
| 88 |
+
if progress < ablation_start:
|
| 89 |
+
lr = max_lr # Plateau
|
| 90 |
+
elif progress < ablation_end:
|
| 91 |
+
# Linear decay during ablation period
|
| 92 |
+
decay_progress = (progress - ablation_start) / (ablation_end - ablation_start)
|
| 93 |
+
lr = max_lr * (1 - decay_progress)
|
| 94 |
+
else:
|
| 95 |
+
lr = 0
|
| 96 |
+
|
| 97 |
+
tokens = progress * total_tokens
|
| 98 |
+
schedules.append({
|
| 99 |
+
'run_name': 'Ablation decay',
|
| 100 |
+
'tokens': tokens,
|
| 101 |
+
'learning_rate': lr
|
| 102 |
+
})
|
| 103 |
+
|
| 104 |
+
return schedules
|
| 105 |
+
|
| 106 |
+
def main():
|
| 107 |
+
# Create output directory if it doesn't exist
|
| 108 |
+
output_dir = "src/content/assets/data"
|
| 109 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 110 |
+
|
| 111 |
+
print("Generating ablation study data...")
|
| 112 |
+
|
| 113 |
+
# Generate from scratch schedule
|
| 114 |
+
from_scratch_data = generate_from_scratch_schedule()
|
| 115 |
+
df_from_scratch = pd.DataFrame(from_scratch_data)
|
| 116 |
+
df_from_scratch.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
|
| 117 |
+
print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(df_from_scratch)} rows")
|
| 118 |
+
|
| 119 |
+
# Generate annealing schedules
|
| 120 |
+
annealing_data = generate_annealing_schedules()
|
| 121 |
+
df_annealing = pd.DataFrame(annealing_data)
|
| 122 |
+
df_annealing.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
|
| 123 |
+
print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(df_annealing)} rows")
|
| 124 |
+
|
| 125 |
+
print("\n✓ Done! CSV files generated successfully.")
|
| 126 |
+
print("\nNext steps:")
|
| 127 |
+
print("1. Use from_scratch_ablation.csv for the first plot")
|
| 128 |
+
print("2. Use annealing_ablation.csv for the second plot")
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
main()
|
app/scripts/jitter-trackio-data.mjs
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
// Jitter Trackio CSV data with small, controlled noise.
|
| 4 |
+
// - Preserves comments (# ...) and blank lines
|
| 5 |
+
// - Leaves 'epoch' values unchanged
|
| 6 |
+
// - Adds mild noise to train/val accuracy (clamped to [0,1])
|
| 7 |
+
// - Adds mild noise to train/val loss (kept >= 0)
|
| 8 |
+
// - Keeps steps untouched
|
| 9 |
+
// Usage:
|
| 10 |
+
// node app/scripts/jitter-trackio-data.mjs \
|
| 11 |
+
// --in app/src/content/assets/data/trackio_wandb_demo.csv \
|
| 12 |
+
// --out app/src/content/assets/data/trackio_wandb_demo.jitter.csv \
|
| 13 |
+
// [--seed 42] [--amount 1.0] [--in-place]
|
| 14 |
+
|
| 15 |
+
import fs from 'node:fs/promises';
|
| 16 |
+
import path from 'node:path';
|
| 17 |
+
|
| 18 |
+
function parseArgs(argv){
|
| 19 |
+
const args = { in: '', out: '', seed: undefined, amount: 1, inPlace: false };
|
| 20 |
+
for (let i = 2; i < argv.length; i++){
|
| 21 |
+
const a = argv[i];
|
| 22 |
+
if (a === '--in' && argv[i+1]) { args.in = argv[++i]; continue; }
|
| 23 |
+
if (a === '--out' && argv[i+1]) { args.out = argv[++i]; continue; }
|
| 24 |
+
if (a === '--seed' && argv[i+1]) { args.seed = Number(argv[++i]); continue; }
|
| 25 |
+
if (a === '--amount' && argv[i+1]) { args.amount = Number(argv[++i]) || 3; continue; }
|
| 26 |
+
if (a === '--in-place') { args.inPlace = true; continue; }
|
| 27 |
+
}
|
| 28 |
+
if (!args.in) throw new Error('--in is required');
|
| 29 |
+
if (args.inPlace) args.out = args.in;
|
| 30 |
+
if (!args.out) {
|
| 31 |
+
const { dir, name, ext } = path.parse(args.in);
|
| 32 |
+
args.out = path.join(dir, `${name}.jitter${ext || '.csv'}`);
|
| 33 |
+
}
|
| 34 |
+
return args;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
function mulberry32(seed){
|
| 38 |
+
let t = seed >>> 0;
|
| 39 |
+
return function(){
|
| 40 |
+
t += 0x6D2B79F5;
|
| 41 |
+
let r = Math.imul(t ^ (t >>> 15), 1 | t);
|
| 42 |
+
r ^= r + Math.imul(r ^ (r >>> 7), 61 | r);
|
| 43 |
+
return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
|
| 44 |
+
};
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
function makeRng(seed){
|
| 48 |
+
if (Number.isFinite(seed)) return mulberry32(seed);
|
| 49 |
+
return Math.random;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
function randn(rng){
|
| 53 |
+
// Box-Muller transform
|
| 54 |
+
let u = 0, v = 0;
|
| 55 |
+
while (u === 0) u = rng();
|
| 56 |
+
while (v === 0) v = rng();
|
| 57 |
+
return Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v);
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
function jitterValue(metric, value, amount, rng){
|
| 61 |
+
const m = metric.toLowerCase();
|
| 62 |
+
if (m === 'epoch') return value; // keep as-is
|
| 63 |
+
if (m.includes('accuracy')){
|
| 64 |
+
const n = Math.max(-0.02 * amount, Math.min(0.02 * amount, randn(rng) * 0.01 * amount));
|
| 65 |
+
return Math.max(0, Math.min(1, value + n));
|
| 66 |
+
}
|
| 67 |
+
if (m.includes('loss')){
|
| 68 |
+
const n = Math.max(-0.03 * amount, Math.min(0.03 * amount, randn(rng) * 0.01 * amount));
|
| 69 |
+
return Math.max(0, value + n);
|
| 70 |
+
}
|
| 71 |
+
// default: tiny noise
|
| 72 |
+
const n = Math.max(-0.01 * amount, Math.min(0.01 * amount, randn(rng) * 0.005 * amount));
|
| 73 |
+
return value + n;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
function formatNumberLike(original, value){
|
| 77 |
+
const s = String(original);
|
| 78 |
+
const dot = s.indexOf('.')
|
| 79 |
+
const decimals = dot >= 0 ? (s.length - dot - 1) : 0;
|
| 80 |
+
if (!Number.isFinite(value)) return s;
|
| 81 |
+
if (decimals <= 0) return String(Math.round(value));
|
| 82 |
+
return value.toFixed(decimals);
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
async function main(){
|
| 86 |
+
const args = parseArgs(process.argv);
|
| 87 |
+
const rng = makeRng(args.seed);
|
| 88 |
+
const raw = await fs.readFile(args.in, 'utf8');
|
| 89 |
+
const lines = raw.split(/\r?\n/);
|
| 90 |
+
const out = new Array(lines.length);
|
| 91 |
+
|
| 92 |
+
for (let i = 0; i < lines.length; i++){
|
| 93 |
+
const line = lines[i];
|
| 94 |
+
if (!line || line.trim().length === 0) { out[i] = line; continue; }
|
| 95 |
+
if (/^\s*#/.test(line)) { out[i] = line; continue; }
|
| 96 |
+
|
| 97 |
+
// Preserve header line unmodified
|
| 98 |
+
if (i === 0 && /^\s*run\s*,\s*step\s*,\s*metric\s*,\s*value\s*,\s*stderr\s*$/i.test(line)) {
|
| 99 |
+
out[i] = line; continue;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
const cols = line.split(',');
|
| 103 |
+
if (cols.length < 4) { out[i] = line; continue; }
|
| 104 |
+
|
| 105 |
+
const [run, stepStr, metric, valueStr, stderrStr = ''] = cols;
|
| 106 |
+
const trimmedMetric = (metric || '').trim();
|
| 107 |
+
const valueNum = Number((valueStr || '').trim());
|
| 108 |
+
|
| 109 |
+
if (!Number.isFinite(valueNum)) { out[i] = line; continue; }
|
| 110 |
+
|
| 111 |
+
const jittered = jitterValue(trimmedMetric, valueNum, args.amount, rng);
|
| 112 |
+
const valueOut = formatNumberLike(valueStr, jittered);
|
| 113 |
+
|
| 114 |
+
// Reassemble with original column count and positions
|
| 115 |
+
const result = [run, stepStr, metric, valueOut, stderrStr].join(',');
|
| 116 |
+
out[i] = result;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
const finalText = out.join('\n');
|
| 120 |
+
await fs.writeFile(args.out, finalText, 'utf8');
|
| 121 |
+
const relIn = path.relative(process.cwd(), args.in);
|
| 122 |
+
const relOut = path.relative(process.cwd(), args.out);
|
| 123 |
+
console.log(`Jittered data written: ${relOut} (from ${relIn})`);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
main().catch(err => {
|
| 127 |
+
console.error(err?.stack || String(err));
|
| 128 |
+
process.exit(1);
|
| 129 |
+
});
|
app/scripts/latex-importer/README.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LaTeX Importer
|
| 2 |
+
|
| 3 |
+
Complete LaTeX to MDX (Markdown + JSX) importer optimized for Astro with advanced support for references, interactive equations, and components.
|
| 4 |
+
|
| 5 |
+
## 🚀 Quick Start
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
# Complete LaTeX → MDX conversion with all features
|
| 9 |
+
node index.mjs
|
| 10 |
+
|
| 11 |
+
# For step-by-step debugging
|
| 12 |
+
node latex-converter.mjs # LaTeX → Markdown
|
| 13 |
+
node mdx-converter.mjs # Markdown → MDX
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## 📁 Structure
|
| 17 |
+
|
| 18 |
+
```
|
| 19 |
+
latex-importer/
|
| 20 |
+
├── index.mjs # Complete LaTeX → MDX pipeline
|
| 21 |
+
├── latex-converter.mjs # LaTeX → Markdown with Pandoc
|
| 22 |
+
├── mdx-converter.mjs # Markdown → MDX with Astro components
|
| 23 |
+
├── reference-preprocessor.mjs # LaTeX references cleanup
|
| 24 |
+
├── post-processor.mjs # Markdown post-processing
|
| 25 |
+
├── bib-cleaner.mjs # Bibliography cleaner
|
| 26 |
+
├── filters/
|
| 27 |
+
│ └── equation-ids.lua # Pandoc filter for KaTeX equations
|
| 28 |
+
├── input/ # LaTeX sources
|
| 29 |
+
│ ├── main.tex
|
| 30 |
+
│ ├── main.bib
|
| 31 |
+
│ └── sections/
|
| 32 |
+
└── output/ # Results
|
| 33 |
+
├── main.md # Intermediate Markdown
|
| 34 |
+
└── main.mdx # Final MDX for Astro
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## ✨ Key Features
|
| 38 |
+
|
| 39 |
+
### 🎯 **Smart References**
|
| 40 |
+
- **Invisible anchors**: Automatic conversion of `\label{}` to `<span id="..." style="position: absolute;"></span>`
|
| 41 |
+
- **Clean links**: Identifier cleanup (`:` → `-`, removing prefixes `sec:`, `fig:`, `eq:`)
|
| 42 |
+
- **Cross-references**: Full support for `\ref{}` with functional links
|
| 43 |
+
|
| 44 |
+
### 🧮 **Interactive Equations**
|
| 45 |
+
- **KaTeX IDs**: Conversion of `\label{eq:...}` to `\htmlId{id}{equation}`
|
| 46 |
+
- **Equation references**: Clickable links to mathematical equations
|
| 47 |
+
- **Advanced KaTeX support**: `trust: true` configuration for `\htmlId{}`
|
| 48 |
+
|
| 49 |
+
### 🎨 **Automatic Styling**
|
| 50 |
+
- **Highlights**: `\highlight{text}` → `<span class="highlight">text</span>`
|
| 51 |
+
- **Auto cleanup**: Removal of numbering `(1)`, `(2)`, etc.
|
| 52 |
+
- **Astro components**: Images → `Figure` with automatic imports
|
| 53 |
+
|
| 54 |
+
### 🔧 **Robust Pipeline**
|
| 55 |
+
- **LaTeX preprocessor**: Reference cleanup before Pandoc
|
| 56 |
+
- **Lua filter**: Equation processing in Pandoc AST
|
| 57 |
+
- **Post-processor**: Markdown cleanup and optimization
|
| 58 |
+
- **MDX converter**: Final transformation with Astro components
|
| 59 |
+
|
| 60 |
+
## 📊 Example Workflow
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
# 1. Prepare LaTeX sources
|
| 64 |
+
cp my-paper/* input/
|
| 65 |
+
|
| 66 |
+
# 2. Complete automatic conversion
|
| 67 |
+
node index.mjs
|
| 68 |
+
|
| 69 |
+
# 3. Generated results
|
| 70 |
+
ls output/
|
| 71 |
+
# → main.md (Intermediate Markdown)
|
| 72 |
+
# → main.mdx (Final MDX for Astro)
|
| 73 |
+
# → assets/image/ (extracted images)
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### 📋 Conversion Result
|
| 77 |
+
|
| 78 |
+
The pipeline generates an MDX file optimized for Astro with:
|
| 79 |
+
|
| 80 |
+
```mdx
|
| 81 |
+
---
|
| 82 |
+
title: "Your Article Title"
|
| 83 |
+
description: "Generated from LaTeX"
|
| 84 |
+
---
|
| 85 |
+
|
| 86 |
+
import Figure from '../components/Figure.astro';
|
| 87 |
+
import figure1 from '../assets/image/figure1.png';
|
| 88 |
+
|
| 89 |
+
## Section with invisible anchor
|
| 90 |
+
<span id="introduction" style="position: absolute;"></span>
|
| 91 |
+
|
| 92 |
+
Here is some text with <span class="highlight">highlighted words</span>.
|
| 93 |
+
|
| 94 |
+
Reference to an interactive [equation](#equation-name).
|
| 95 |
+
|
| 96 |
+
Equation with KaTeX ID:
|
| 97 |
+
$$\htmlId{equation-name}{E = mc^2}$$
|
| 98 |
+
|
| 99 |
+
<Figure src={figure1} alt="Description" />
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
## ⚙️ Required Astro Configuration
|
| 103 |
+
|
| 104 |
+
To use equations with IDs, add to `astro.config.mjs`:
|
| 105 |
+
|
| 106 |
+
```javascript
|
| 107 |
+
import rehypeKatex from 'rehype-katex';
|
| 108 |
+
|
| 109 |
+
export default defineConfig({
|
| 110 |
+
markdown: {
|
| 111 |
+
rehypePlugins: [
|
| 112 |
+
[rehypeKatex, { trust: true }], // ← Important for \htmlId{}
|
| 113 |
+
],
|
| 114 |
+
},
|
| 115 |
+
});
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
## 🛠️ Prerequisites
|
| 119 |
+
|
| 120 |
+
- **Node.js** with ESM support
|
| 121 |
+
- **Pandoc** (`brew install pandoc`)
|
| 122 |
+
- **Astro** to use the generated MDX
|
| 123 |
+
|
| 124 |
+
## 🎯 Technical Architecture
|
| 125 |
+
|
| 126 |
+
### 4-Stage Pipeline
|
| 127 |
+
|
| 128 |
+
1. **LaTeX Preprocessing** (`reference-preprocessor.mjs`)
|
| 129 |
+
- Cleanup of `\label{}` and `\ref{}`
|
| 130 |
+
- Conversion `\highlight{}` → CSS spans
|
| 131 |
+
- Removal of prefixes and problematic characters
|
| 132 |
+
|
| 133 |
+
2. **Pandoc + Lua Filter** (`equation-ids.lua`)
|
| 134 |
+
- LaTeX → Markdown conversion with `gfm+tex_math_dollars+raw_html`
|
| 135 |
+
- Equation processing: `\label{eq:name}` → `\htmlId{name}{equation}`
|
| 136 |
+
- Automatic image extraction
|
| 137 |
+
|
| 138 |
+
3. **Markdown Post-processing** (`post-processor.mjs`)
|
| 139 |
+
- KaTeX, Unicode, grouping commands cleanup
|
| 140 |
+
- Attribute correction with `:`
|
| 141 |
+
- Code snippet injection
|
| 142 |
+
|
| 143 |
+
4. **MDX Conversion** (`mdx-converter.mjs`)
|
| 144 |
+
- Images transformation → `Figure`
|
| 145 |
+
- HTML span escaping correction
|
| 146 |
+
- Automatic imports generation
|
| 147 |
+
- MDX frontmatter
|
| 148 |
+
|
| 149 |
+
## 📊 Conversion Statistics
|
| 150 |
+
|
| 151 |
+
For a typical scientific document:
|
| 152 |
+
- **87 labels** detected and processed
|
| 153 |
+
- **48 invisible anchors** created
|
| 154 |
+
- **13 highlight spans** with CSS class
|
| 155 |
+
- **4 equations** with `\htmlId{}` KaTeX
|
| 156 |
+
- **40 images** converted to components
|
| 157 |
+
|
| 158 |
+
## ✅ Project Status
|
| 159 |
+
|
| 160 |
+
### 🎉 **Complete Features**
|
| 161 |
+
- ✅ **LaTeX → MDX Pipeline**: Full end-to-end functional conversion
|
| 162 |
+
- ✅ **Cross-document references**: Perfectly functional internal links
|
| 163 |
+
- ✅ **Interactive equations**: KaTeX support with clickable IDs
|
| 164 |
+
- ✅ **Automatic styling**: Highlights and Astro components
|
| 165 |
+
- ✅ **Robustness**: Automatic cleanup of all escaping
|
| 166 |
+
- ✅ **Optimization**: Clean code without unnecessary elements
|
| 167 |
+
|
| 168 |
+
### 🚀 **Production Ready**
|
| 169 |
+
The toolkit is now **100% operational** for converting complex scientific LaTeX documents to MDX/Astro with all advanced features (references, interactive equations, styling).
|
app/scripts/latex-importer/bib-cleaner.mjs
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
| 4 |
+
import { join, dirname, basename } from 'path';
|
| 5 |
+
|
| 6 |
+
/**
|
| 7 |
+
* Clean a BibTeX file by removing local file references and paths
|
| 8 |
+
* @param {string} inputBibFile - Path to the input .bib file
|
| 9 |
+
* @param {string} outputBibFile - Path to the output cleaned .bib file
|
| 10 |
+
* @returns {boolean} - Success status
|
| 11 |
+
*/
|
| 12 |
+
export function cleanBibliography(inputBibFile, outputBibFile) {
|
| 13 |
+
if (!existsSync(inputBibFile)) {
|
| 14 |
+
console.log(' ⚠️ No bibliography file found:', inputBibFile);
|
| 15 |
+
return false;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
console.log('📚 Cleaning bibliography...');
|
| 19 |
+
let bibContent = readFileSync(inputBibFile, 'utf8');
|
| 20 |
+
|
| 21 |
+
// Remove file paths and local references
|
| 22 |
+
bibContent = bibContent.replace(/file = \{[^}]+\}/g, '');
|
| 23 |
+
|
| 24 |
+
// Remove empty lines created by file removal
|
| 25 |
+
bibContent = bibContent.replace(/,\s*\n\s*\n/g, '\n\n');
|
| 26 |
+
bibContent = bibContent.replace(/,\s*\}/g, '\n}');
|
| 27 |
+
|
| 28 |
+
// Clean up double commas
|
| 29 |
+
bibContent = bibContent.replace(/,,/g, ',');
|
| 30 |
+
|
| 31 |
+
// Remove trailing commas before closing braces
|
| 32 |
+
bibContent = bibContent.replace(/,(\s*\n\s*)\}/g, '$1}');
|
| 33 |
+
|
| 34 |
+
writeFileSync(outputBibFile, bibContent);
|
| 35 |
+
console.log(` 📄 Clean bibliography saved: ${outputBibFile}`);
|
| 36 |
+
|
| 37 |
+
return true;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/**
|
| 41 |
+
* CLI for bibliography cleaning
|
| 42 |
+
*/
|
| 43 |
+
function main() {
|
| 44 |
+
const args = process.argv.slice(2);
|
| 45 |
+
|
| 46 |
+
if (args.includes('--help') || args.includes('-h')) {
|
| 47 |
+
console.log(`
|
| 48 |
+
📚 BibTeX Bibliography Cleaner
|
| 49 |
+
|
| 50 |
+
Usage:
|
| 51 |
+
node bib-cleaner.mjs [input.bib] [output.bib]
|
| 52 |
+
node bib-cleaner.mjs --input=input.bib --output=output.bib
|
| 53 |
+
|
| 54 |
+
Options:
|
| 55 |
+
--input=FILE Input .bib file
|
| 56 |
+
--output=FILE Output cleaned .bib file
|
| 57 |
+
--help, -h Show this help
|
| 58 |
+
|
| 59 |
+
Examples:
|
| 60 |
+
# Clean main.bib to clean.bib
|
| 61 |
+
node bib-cleaner.mjs main.bib clean.bib
|
| 62 |
+
|
| 63 |
+
# Using flags
|
| 64 |
+
node bib-cleaner.mjs --input=references.bib --output=clean-refs.bib
|
| 65 |
+
`);
|
| 66 |
+
process.exit(0);
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
let inputFile, outputFile;
|
| 70 |
+
|
| 71 |
+
// Parse command line arguments
|
| 72 |
+
if (args.length >= 2 && !args[0].startsWith('--')) {
|
| 73 |
+
// Positional arguments
|
| 74 |
+
inputFile = args[0];
|
| 75 |
+
outputFile = args[1];
|
| 76 |
+
} else {
|
| 77 |
+
// Named arguments
|
| 78 |
+
for (const arg of args) {
|
| 79 |
+
if (arg.startsWith('--input=')) {
|
| 80 |
+
inputFile = arg.split('=')[1];
|
| 81 |
+
} else if (arg.startsWith('--output=')) {
|
| 82 |
+
outputFile = arg.split('=')[1];
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
if (!inputFile || !outputFile) {
|
| 88 |
+
console.error('❌ Both input and output files are required');
|
| 89 |
+
console.log('Use --help for usage information');
|
| 90 |
+
process.exit(1);
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
const success = cleanBibliography(inputFile, outputFile);
|
| 94 |
+
if (success) {
|
| 95 |
+
console.log('🎉 Bibliography cleaning completed!');
|
| 96 |
+
} else {
|
| 97 |
+
process.exit(1);
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
// Run CLI if called directly
|
| 102 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 103 |
+
main();
|
| 104 |
+
}
|
app/scripts/latex-importer/filters/equation-ids.lua
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--[[
|
| 2 |
+
Pandoc Lua filter to add IDs to equations using KaTeX \htmlId syntax
|
| 3 |
+
|
| 4 |
+
This filter processes display math equations and inline math that contain
|
| 5 |
+
\label{} commands, and wraps them with \htmlId{clean-id}{content} for KaTeX.
|
| 6 |
+
|
| 7 |
+
Requirements:
|
| 8 |
+
- KaTeX renderer with trust: true option
|
| 9 |
+
- Equations with \label{} commands in LaTeX
|
| 10 |
+
--]]
|
| 11 |
+
|
| 12 |
+
-- Function to clean identifier strings (remove prefixes and colons)
|
| 13 |
+
function clean_identifier(id_str)
|
| 14 |
+
if id_str and type(id_str) == "string" then
|
| 15 |
+
-- Remove common prefixes and replace colons with dashes
|
| 16 |
+
local clean = id_str
|
| 17 |
+
:gsub("^(eq|equation):", "") -- Remove eq: prefix
|
| 18 |
+
:gsub(":", "-") -- Replace colons with dashes
|
| 19 |
+
:gsub("[^a-zA-Z0-9_-]", "-") -- Replace other problematic chars
|
| 20 |
+
:gsub("-+", "-") -- Collapse multiple dashes
|
| 21 |
+
:gsub("^-", "") -- Remove leading dash
|
| 22 |
+
:gsub("-$", "") -- Remove trailing dash
|
| 23 |
+
|
| 24 |
+
-- Ensure we don't have empty identifiers
|
| 25 |
+
if clean == "" then
|
| 26 |
+
clean = id_str:gsub(":", "-")
|
| 27 |
+
end
|
| 28 |
+
|
| 29 |
+
return clean
|
| 30 |
+
end
|
| 31 |
+
return id_str
|
| 32 |
+
end
|
| 33 |
+
|
| 34 |
+
-- Process Math elements (both inline and display)
|
| 35 |
+
function Math(el)
|
| 36 |
+
local math_content = el.text
|
| 37 |
+
|
| 38 |
+
-- Look for \label{...} commands in the math content
|
| 39 |
+
local label_match = math_content:match("\\label%{([^}]+)%}")
|
| 40 |
+
|
| 41 |
+
if label_match then
|
| 42 |
+
-- Clean the identifier
|
| 43 |
+
local clean_id = clean_identifier(label_match)
|
| 44 |
+
|
| 45 |
+
-- Remove the \label{} command from the math content
|
| 46 |
+
local clean_math = math_content:gsub("\\label%{[^}]+%}", "")
|
| 47 |
+
|
| 48 |
+
-- Clean up any extra whitespace or line breaks that might remain
|
| 49 |
+
clean_math = clean_math:gsub("%s*$", ""):gsub("^%s*", "")
|
| 50 |
+
|
| 51 |
+
-- Handle different equation environments appropriately
|
| 52 |
+
-- For align environments, preserve them as they work with KaTeX
|
| 53 |
+
local has_align = clean_math:match("\\begin%{align%}")
|
| 54 |
+
|
| 55 |
+
if has_align then
|
| 56 |
+
-- For align environments, we keep the structure and add ID as an attribute
|
| 57 |
+
-- KaTeX supports align environments natively
|
| 58 |
+
clean_math = clean_math:gsub("\\begin%{align%}", "\\begin{align}")
|
| 59 |
+
clean_math = clean_math:gsub("\\end%{align%}", "\\end{align}")
|
| 60 |
+
else
|
| 61 |
+
-- Remove other equation environments that don't work well with \htmlId
|
| 62 |
+
clean_math = clean_math:gsub("\\begin%{equation%}", ""):gsub("\\end%{equation%}", "")
|
| 63 |
+
clean_math = clean_math:gsub("\\begin%{equation%*%}", ""):gsub("\\end%{equation%*%}", "")
|
| 64 |
+
clean_math = clean_math:gsub("\\begin%{align%*%}", ""):gsub("\\end%{align%*%}", "")
|
| 65 |
+
end
|
| 66 |
+
|
| 67 |
+
-- Clean up any remaining whitespace
|
| 68 |
+
clean_math = clean_math:gsub("%s*$", ""):gsub("^%s*", "")
|
| 69 |
+
|
| 70 |
+
local new_math
|
| 71 |
+
if has_align then
|
| 72 |
+
-- For align environments, KaTeX doesn't support \htmlId with align
|
| 73 |
+
-- Instead, we add a special marker that the post-processor will convert to a span
|
| 74 |
+
-- This span will serve as an anchor for references
|
| 75 |
+
new_math = "%%ALIGN_ANCHOR_ID{" .. clean_id .. "}%%\n" .. clean_math
|
| 76 |
+
else
|
| 77 |
+
-- For other math, wrap with \htmlId{}
|
| 78 |
+
new_math = "\\htmlId{" .. clean_id .. "}{" .. clean_math .. "}"
|
| 79 |
+
end
|
| 80 |
+
|
| 81 |
+
-- Return new Math element with the updated content
|
| 82 |
+
return pandoc.Math(el.mathtype, new_math)
|
| 83 |
+
end
|
| 84 |
+
|
| 85 |
+
-- Return unchanged if no label found
|
| 86 |
+
return el
|
| 87 |
+
end
|
| 88 |
+
|
| 89 |
+
-- Optional: Process RawInline elements that might contain LaTeX math
|
| 90 |
+
function RawInline(el)
|
| 91 |
+
if el.format == "latex" or el.format == "tex" then
|
| 92 |
+
local content = el.text
|
| 93 |
+
|
| 94 |
+
-- Look for equation environments with labels
|
| 95 |
+
local label_match = content:match("\\label%{([^}]+)%}")
|
| 96 |
+
|
| 97 |
+
if label_match then
|
| 98 |
+
local clean_id = clean_identifier(label_match)
|
| 99 |
+
|
| 100 |
+
-- For raw LaTeX, we might need different handling
|
| 101 |
+
-- This is a simplified approach - adjust based on your needs
|
| 102 |
+
local clean_content = content:gsub("\\label%{[^}]+%}", "")
|
| 103 |
+
|
| 104 |
+
if clean_content:match("\\begin%{equation") or clean_content:match("\\begin%{align") then
|
| 105 |
+
-- For equation environments, we might need to wrap differently
|
| 106 |
+
-- This depends on how your KaTeX setup handles equation environments
|
| 107 |
+
return pandoc.RawInline(el.format, clean_content)
|
| 108 |
+
end
|
| 109 |
+
end
|
| 110 |
+
end
|
| 111 |
+
|
| 112 |
+
return el
|
| 113 |
+
end
|
| 114 |
+
|
| 115 |
+
-- Optional: Process RawBlock elements for display equations
|
| 116 |
+
function RawBlock(el)
|
| 117 |
+
if el.format == "latex" or el.format == "tex" then
|
| 118 |
+
local content = el.text
|
| 119 |
+
|
| 120 |
+
-- Look for equation environments with labels
|
| 121 |
+
local label_match = content:match("\\label%{([^}]+)%}")
|
| 122 |
+
|
| 123 |
+
if label_match then
|
| 124 |
+
local clean_id = clean_identifier(label_match)
|
| 125 |
+
local clean_content = content:gsub("\\label%{[^}]+%}", "")
|
| 126 |
+
|
| 127 |
+
-- For block equations, we might want to preserve the structure
|
| 128 |
+
-- but add the htmlId functionality
|
| 129 |
+
return pandoc.RawBlock(el.format, clean_content)
|
| 130 |
+
end
|
| 131 |
+
end
|
| 132 |
+
|
| 133 |
+
return el
|
| 134 |
+
end
|
app/scripts/latex-importer/index.mjs
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { join, dirname } from 'path';
|
| 4 |
+
import { fileURLToPath } from 'url';
|
| 5 |
+
import { copyFileSync } from 'fs';
|
| 6 |
+
import { convertLatexToMarkdown } from './latex-converter.mjs';
|
| 7 |
+
import { convertToMdx } from './mdx-converter.mjs';
|
| 8 |
+
import { cleanBibliography } from './bib-cleaner.mjs';
|
| 9 |
+
|
| 10 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 11 |
+
const __dirname = dirname(__filename);
|
| 12 |
+
|
| 13 |
+
// Default configuration
|
| 14 |
+
const DEFAULT_INPUT = join(__dirname, 'input', 'main.tex');
|
| 15 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output');
|
| 16 |
+
const ASTRO_CONTENT_PATH = join(__dirname, '..', '..', 'src', 'content', 'article.mdx');
|
| 17 |
+
|
| 18 |
+
function parseArgs() {
|
| 19 |
+
const args = process.argv.slice(2);
|
| 20 |
+
const config = {
|
| 21 |
+
input: DEFAULT_INPUT,
|
| 22 |
+
output: DEFAULT_OUTPUT,
|
| 23 |
+
clean: false,
|
| 24 |
+
bibOnly: false,
|
| 25 |
+
convertOnly: false,
|
| 26 |
+
mdx: false,
|
| 27 |
+
};
|
| 28 |
+
|
| 29 |
+
for (const arg of args) {
|
| 30 |
+
if (arg.startsWith('--input=')) {
|
| 31 |
+
config.input = arg.split('=')[1];
|
| 32 |
+
} else if (arg.startsWith('--output=')) {
|
| 33 |
+
config.output = arg.split('=')[1];
|
| 34 |
+
} else if (arg === '--clean') {
|
| 35 |
+
config.clean = true;
|
| 36 |
+
} else if (arg === '--bib-only') {
|
| 37 |
+
config.bibOnly = true;
|
| 38 |
+
} else if (arg === '--convert-only') {
|
| 39 |
+
config.convertOnly = true;
|
| 40 |
+
}
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
return config;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
function showHelp() {
|
| 47 |
+
console.log(`
|
| 48 |
+
🚀 LaTeX to Markdown Toolkit
|
| 49 |
+
|
| 50 |
+
Usage:
|
| 51 |
+
node index.mjs [options]
|
| 52 |
+
|
| 53 |
+
Options:
|
| 54 |
+
--input=PATH Input LaTeX file (default: input/main.tex)
|
| 55 |
+
--output=PATH Output directory (default: output/)
|
| 56 |
+
--clean Clean output directory before processing
|
| 57 |
+
--bib-only Only clean bibliography file
|
| 58 |
+
--convert-only Only convert LaTeX to Markdown (skip bib cleaning)
|
| 59 |
+
--help, -h Show this help
|
| 60 |
+
|
| 61 |
+
Examples:
|
| 62 |
+
# Full conversion with bibliography cleaning
|
| 63 |
+
node index.mjs --clean
|
| 64 |
+
|
| 65 |
+
# Only clean bibliography
|
| 66 |
+
node index.mjs --bib-only --input=paper.tex --output=clean/
|
| 67 |
+
|
| 68 |
+
# Only convert LaTeX (use existing clean bibliography)
|
| 69 |
+
node index.mjs --convert-only
|
| 70 |
+
|
| 71 |
+
# Custom paths
|
| 72 |
+
node index.mjs --input=../paper/main.tex --output=../results/ --clean
|
| 73 |
+
`);
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
function main() {
|
| 77 |
+
const args = process.argv.slice(2);
|
| 78 |
+
|
| 79 |
+
if (args.includes('--help') || args.includes('-h')) {
|
| 80 |
+
showHelp();
|
| 81 |
+
process.exit(0);
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
const config = parseArgs();
|
| 85 |
+
|
| 86 |
+
console.log('🚀 LaTeX to Markdown Toolkit');
|
| 87 |
+
console.log('==============================');
|
| 88 |
+
|
| 89 |
+
try {
|
| 90 |
+
if (config.bibOnly) {
|
| 91 |
+
// Only clean bibliography
|
| 92 |
+
console.log('📚 Bibliography cleaning mode');
|
| 93 |
+
const bibInput = config.input.replace('.tex', '.bib');
|
| 94 |
+
const bibOutput = join(config.output, 'main.bib');
|
| 95 |
+
|
| 96 |
+
cleanBibliography(bibInput, bibOutput);
|
| 97 |
+
console.log('🎉 Bibliography cleaning completed!');
|
| 98 |
+
|
| 99 |
+
} else if (config.convertOnly) {
|
| 100 |
+
// Only convert LaTeX
|
| 101 |
+
console.log('📄 Conversion only mode');
|
| 102 |
+
convertLatexToMarkdown(config.input, config.output);
|
| 103 |
+
|
| 104 |
+
} else {
|
| 105 |
+
// Full workflow
|
| 106 |
+
console.log('🔄 Full conversion workflow');
|
| 107 |
+
convertLatexToMarkdown(config.input, config.output);
|
| 108 |
+
|
| 109 |
+
// Convert to MDX if requested
|
| 110 |
+
const markdownFile = join(config.output, 'main.md');
|
| 111 |
+
const mdxFile = join(config.output, 'main.mdx');
|
| 112 |
+
|
| 113 |
+
console.log('📝 Converting Markdown to MDX...');
|
| 114 |
+
convertToMdx(markdownFile, mdxFile);
|
| 115 |
+
|
| 116 |
+
// Copy MDX to Astro content directory
|
| 117 |
+
console.log('📋 Copying MDX to Astro content directory...');
|
| 118 |
+
try {
|
| 119 |
+
copyFileSync(mdxFile, ASTRO_CONTENT_PATH);
|
| 120 |
+
console.log(` ✅ Copied to ${ASTRO_CONTENT_PATH}`);
|
| 121 |
+
} catch (error) {
|
| 122 |
+
console.warn(` ⚠️ Failed to copy MDX to Astro: ${error.message}`);
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
} catch (error) {
|
| 127 |
+
console.error('❌ Error:', error.message);
|
| 128 |
+
process.exit(1);
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
// Export functions for use as module
|
| 133 |
+
export { convertLatexToMarkdown, cleanBibliography };
|
| 134 |
+
|
| 135 |
+
// Run CLI if called directly
|
| 136 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 137 |
+
main();
|
| 138 |
+
}
|
app/scripts/latex-importer/latex-converter.mjs
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { execSync } from 'child_process';
|
| 4 |
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
| 5 |
+
import { join, dirname, basename } from 'path';
|
| 6 |
+
import { fileURLToPath } from 'url';
|
| 7 |
+
import { cleanBibliography } from './bib-cleaner.mjs';
|
| 8 |
+
import { postProcessMarkdown } from './post-processor.mjs';
|
| 9 |
+
import { preprocessLatexReferences } from './reference-preprocessor.mjs';
|
| 10 |
+
|
| 11 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 12 |
+
const __dirname = dirname(__filename);
|
| 13 |
+
|
| 14 |
+
// Configuration
|
| 15 |
+
const DEFAULT_INPUT = join(__dirname, 'input', 'main.tex');
|
| 16 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output');
|
| 17 |
+
|
| 18 |
+
function parseArgs() {
|
| 19 |
+
const args = process.argv.slice(2);
|
| 20 |
+
const config = {
|
| 21 |
+
input: DEFAULT_INPUT,
|
| 22 |
+
output: DEFAULT_OUTPUT,
|
| 23 |
+
clean: false
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
for (const arg of args) {
|
| 27 |
+
if (arg.startsWith('--input=')) {
|
| 28 |
+
config.input = arg.split('=')[1];
|
| 29 |
+
} else if (arg.startsWith('--output=')) {
|
| 30 |
+
config.output = arg.split('=')[1];
|
| 31 |
+
} else if (arg === '--clean') {
|
| 32 |
+
config.clean = true;
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
return config;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
function ensureDirectory(dir) {
|
| 40 |
+
if (!existsSync(dir)) {
|
| 41 |
+
mkdirSync(dir, { recursive: true });
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
function cleanDirectory(dir) {
|
| 46 |
+
if (existsSync(dir)) {
|
| 47 |
+
execSync(`rm -rf "${dir}"/*`, { stdio: 'inherit' });
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
function preprocessLatexFile(inputFile, outputDir) {
|
| 52 |
+
const inputDir = dirname(inputFile);
|
| 53 |
+
const tempFile = join(outputDir, 'temp_main.tex');
|
| 54 |
+
|
| 55 |
+
console.log('🔄 Preprocessing LaTeX file to resolve \\input commands...');
|
| 56 |
+
|
| 57 |
+
let content = readFileSync(inputFile, 'utf8');
|
| 58 |
+
|
| 59 |
+
// Remove problematic commands that break pandoc
|
| 60 |
+
console.log('🧹 Cleaning problematic LaTeX constructs...');
|
| 61 |
+
|
| 62 |
+
// Fix citation issues - but not in citation keys
|
| 63 |
+
content = content.replace(/\$p_0\$(?![A-Za-z])/g, 'p0');
|
| 64 |
+
|
| 65 |
+
// Convert complex math environments to simple delimiters
|
| 66 |
+
content = content.replace(/\$\$\\begin\{equation\*\}/g, '$$');
|
| 67 |
+
content = content.replace(/\\end\{equation\*\}\$\$/g, '$$');
|
| 68 |
+
content = content.replace(/\\begin\{equation\*\}/g, '$$');
|
| 69 |
+
content = content.replace(/\\end\{equation\*\}/g, '$$');
|
| 70 |
+
// Keep align environments intact for KaTeX support
|
| 71 |
+
// Protect align environments by temporarily replacing them before cleaning & operators
|
| 72 |
+
const alignBlocks = [];
|
| 73 |
+
content = content.replace(/\\begin\{align\}([\s\S]*?)\\end\{align\}/g, (match, alignContent) => {
|
| 74 |
+
alignBlocks.push(match);
|
| 75 |
+
return `__ALIGN_BLOCK_${alignBlocks.length - 1}__`;
|
| 76 |
+
});
|
| 77 |
+
|
| 78 |
+
// Now remove & operators from non-align content (outside align environments)
|
| 79 |
+
content = content.replace(/&=/g, '=');
|
| 80 |
+
content = content.replace(/&/g, '');
|
| 81 |
+
|
| 82 |
+
// Restore align blocks with their & operators intact
|
| 83 |
+
alignBlocks.forEach((block, index) => {
|
| 84 |
+
content = content.replace(`__ALIGN_BLOCK_${index}__`, block);
|
| 85 |
+
});
|
| 86 |
+
|
| 87 |
+
// Convert LaTeX citations to Pandoc format
|
| 88 |
+
content = content.replace(/\\cite[tp]?\{([^}]+)\}/g, (match, citations) => {
|
| 89 |
+
// Handle multiple citations separated by commas - all become simple @citations
|
| 90 |
+
return citations.split(',').map(cite => `@${cite.trim()}`).join(', ');
|
| 91 |
+
});
|
| 92 |
+
|
| 93 |
+
// Handle complex \textsc with nested math - extract and simplify (but not in command definitions)
|
| 94 |
+
content = content.replace(/\\textsc\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, content_inside, offset) => {
|
| 95 |
+
// Skip if this is inside a \newcommand or similar definition
|
| 96 |
+
const before = content.substring(Math.max(0, offset - 50), offset);
|
| 97 |
+
if (before.includes('\\newcommand') || before.includes('\\renewcommand') || before.includes('\\def')) {
|
| 98 |
+
return match; // Keep original
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
// Remove math delimiters inside textsc for simplification
|
| 102 |
+
const simplified = content_inside.replace(/\\\([^)]+\\\)/g, 'MATHEXPR');
|
| 103 |
+
return `\\text{${simplified}}`;
|
| 104 |
+
});
|
| 105 |
+
|
| 106 |
+
// Remove complex custom commands that pandoc can't handle
|
| 107 |
+
content = content.replace(/\\input\{snippets\/[^}]+\}/g, '% Code snippet removed');
|
| 108 |
+
|
| 109 |
+
// Find all \input{} commands (but skip commented ones)
|
| 110 |
+
const inputRegex = /^([^%]*?)\\input\{([^}]+)\}/gm;
|
| 111 |
+
let match;
|
| 112 |
+
|
| 113 |
+
while ((match = inputRegex.exec(content)) !== null) {
|
| 114 |
+
const beforeInput = match[1];
|
| 115 |
+
const inputPath = match[2];
|
| 116 |
+
|
| 117 |
+
// Skip if the \input is commented (% appears before \input on the line)
|
| 118 |
+
if (beforeInput.includes('%')) {
|
| 119 |
+
continue;
|
| 120 |
+
}
|
| 121 |
+
let fullPath;
|
| 122 |
+
|
| 123 |
+
// Skip only problematic files, let Pandoc handle macros
|
| 124 |
+
if (inputPath.includes('snippets/')) {
|
| 125 |
+
console.log(` Skipping: ${inputPath}`);
|
| 126 |
+
content = content.replace(`\\input{${inputPath}}`, `% Skipped: ${inputPath}`);
|
| 127 |
+
continue;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
// Handle paths with or without .tex extension
|
| 131 |
+
if (inputPath.endsWith('.tex')) {
|
| 132 |
+
fullPath = join(inputDir, inputPath);
|
| 133 |
+
} else {
|
| 134 |
+
fullPath = join(inputDir, inputPath + '.tex');
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
if (existsSync(fullPath)) {
|
| 138 |
+
console.log(` Including: ${inputPath}`);
|
| 139 |
+
let includedContent = readFileSync(fullPath, 'utf8');
|
| 140 |
+
|
| 141 |
+
// Clean included content too
|
| 142 |
+
includedContent = includedContent.replace(/\$p_0\$/g, 'p0');
|
| 143 |
+
includedContent = includedContent.replace(/\\input\{snippets\/[^}]+\}/g, '% Code snippet removed');
|
| 144 |
+
|
| 145 |
+
// Handle complex \textsc in included content
|
| 146 |
+
includedContent = includedContent.replace(/\\textsc\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, content_inside, offset) => {
|
| 147 |
+
// Skip if this is inside a \newcommand or similar definition
|
| 148 |
+
const before = includedContent.substring(Math.max(0, offset - 50), offset);
|
| 149 |
+
if (before.includes('\\newcommand') || before.includes('\\renewcommand') || before.includes('\\def')) {
|
| 150 |
+
return match; // Keep original
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
const simplified = content_inside.replace(/\\\([^)]+\\\)/g, 'MATHEXPR');
|
| 154 |
+
return `\\text{${simplified}}`;
|
| 155 |
+
});
|
| 156 |
+
|
| 157 |
+
// Apply same align-preserving logic to included content
|
| 158 |
+
const alignBlocksIncluded = [];
|
| 159 |
+
includedContent = includedContent.replace(/\\begin\{align\}([\s\S]*?)\\end\{align\}/g, (match, alignContent) => {
|
| 160 |
+
alignBlocksIncluded.push(match);
|
| 161 |
+
return `__ALIGN_BLOCK_${alignBlocksIncluded.length - 1}__`;
|
| 162 |
+
});
|
| 163 |
+
|
| 164 |
+
// Remove alignment operators from non-align content in included files
|
| 165 |
+
includedContent = includedContent.replace(/&=/g, '=');
|
| 166 |
+
includedContent = includedContent.replace(/&/g, '');
|
| 167 |
+
|
| 168 |
+
// Restore align blocks with their & operators intact
|
| 169 |
+
alignBlocksIncluded.forEach((block, index) => {
|
| 170 |
+
includedContent = includedContent.replace(`__ALIGN_BLOCK_${index}__`, block);
|
| 171 |
+
});
|
| 172 |
+
|
| 173 |
+
// Convert math environments in included content
|
| 174 |
+
includedContent = includedContent.replace(/\$\$\\begin\{equation\*\}/g, '$$');
|
| 175 |
+
includedContent = includedContent.replace(/\\end\{equation\*\}\$\$/g, '$$');
|
| 176 |
+
includedContent = includedContent.replace(/\\begin\{equation\*\}/g, '$$');
|
| 177 |
+
includedContent = includedContent.replace(/\\end\{equation\*\}/g, '$$');
|
| 178 |
+
|
| 179 |
+
// Convert citations in included content
|
| 180 |
+
includedContent = includedContent.replace(/\\cite[tp]?\{([^}]+)\}/g, (match, citations) => {
|
| 181 |
+
return citations.split(',').map(cite => `@${cite.trim()}`).join(', ');
|
| 182 |
+
});
|
| 183 |
+
|
| 184 |
+
content = content.replace(`\\input{${inputPath}}`, includedContent);
|
| 185 |
+
} else {
|
| 186 |
+
console.log(` ⚠️ File not found: ${fullPath} (skipping)`);
|
| 187 |
+
content = content.replace(`\\input{${inputPath}}`, `% File not found: ${inputPath}`);
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
// Apply reference preprocessing AFTER input inclusion to ensure all references are captured
|
| 192 |
+
console.log('🔧 Preprocessing LaTeX references for MDX compatibility...');
|
| 193 |
+
const referenceResult = preprocessLatexReferences(content);
|
| 194 |
+
content = referenceResult.content;
|
| 195 |
+
|
| 196 |
+
// Write the preprocessed file
|
| 197 |
+
writeFileSync(tempFile, content);
|
| 198 |
+
return tempFile;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
function processBibliography(inputFile, outputDir) {
|
| 202 |
+
const bibFile = join(dirname(inputFile), 'main.bib');
|
| 203 |
+
const outputBibFile = join(outputDir, 'main.bib');
|
| 204 |
+
|
| 205 |
+
if (!existsSync(bibFile)) {
|
| 206 |
+
console.log(' ⚠️ No bibliography file found');
|
| 207 |
+
return null;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
const success = cleanBibliography(bibFile, outputBibFile);
|
| 211 |
+
return success ? outputBibFile : null;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
export function convertLatexToMarkdown(inputFile, outputDir) {
|
| 215 |
+
console.log('🚀 Simple LaTeX to Markdown Converter');
|
| 216 |
+
console.log(`📁 Input: ${inputFile}`);
|
| 217 |
+
console.log(`📁 Output: ${outputDir}`);
|
| 218 |
+
|
| 219 |
+
// Check if input file exists
|
| 220 |
+
if (!existsSync(inputFile)) {
|
| 221 |
+
console.error(`❌ Input file not found: ${inputFile}`);
|
| 222 |
+
process.exit(1);
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
// Ensure output directory exists
|
| 226 |
+
ensureDirectory(outputDir);
|
| 227 |
+
|
| 228 |
+
try {
|
| 229 |
+
// Check if pandoc is available
|
| 230 |
+
execSync('pandoc --version', { stdio: 'pipe' });
|
| 231 |
+
} catch (error) {
|
| 232 |
+
console.error('❌ Pandoc not found. Please install it: brew install pandoc');
|
| 233 |
+
process.exit(1);
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
// Clean and copy bibliography
|
| 237 |
+
const cleanBibFile = processBibliography(inputFile, outputDir);
|
| 238 |
+
|
| 239 |
+
// Preprocess the LaTeX file to resolve \input commands
|
| 240 |
+
const preprocessedFile = preprocessLatexFile(inputFile, outputDir);
|
| 241 |
+
|
| 242 |
+
const inputFileName = basename(inputFile, '.tex');
|
| 243 |
+
const outputFile = join(outputDir, `${inputFileName}.md`);
|
| 244 |
+
|
| 245 |
+
try {
|
| 246 |
+
console.log('📄 Converting with Pandoc...');
|
| 247 |
+
|
| 248 |
+
// Enhanced pandoc conversion - use tex_math_dollars for KaTeX compatibility
|
| 249 |
+
const bibOption = cleanBibFile ? `--bibliography="${cleanBibFile}"` : '';
|
| 250 |
+
|
| 251 |
+
// Use gfm+tex_math_dollars for simple $ delimiters compatible with KaTeX
|
| 252 |
+
const mediaDir = join(outputDir, 'assets', 'image');
|
| 253 |
+
ensureDirectory(mediaDir);
|
| 254 |
+
const inputDir = dirname(inputFile);
|
| 255 |
+
const equationFilterPath = join(__dirname, 'filters', 'equation-ids.lua');
|
| 256 |
+
const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars+raw_html --shift-heading-level-by=1 --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" --lua-filter="${equationFilterPath}" -o "${outputFile}"`;
|
| 257 |
+
|
| 258 |
+
console.log(` Running: ${pandocCommand}`);
|
| 259 |
+
execSync(pandocCommand, { stdio: 'pipe' });
|
| 260 |
+
|
| 261 |
+
// Clean up temp file
|
| 262 |
+
execSync(`rm "${preprocessedFile}"`, { stdio: 'pipe' });
|
| 263 |
+
|
| 264 |
+
// Post-processing to fix KaTeX incompatible constructions
|
| 265 |
+
let markdownContent = readFileSync(outputFile, 'utf8');
|
| 266 |
+
|
| 267 |
+
// Use modular post-processor with code injection
|
| 268 |
+
markdownContent = postProcessMarkdown(markdownContent, inputDir);
|
| 269 |
+
|
| 270 |
+
writeFileSync(outputFile, markdownContent);
|
| 271 |
+
|
| 272 |
+
console.log(`✅ Conversion completed: ${outputFile}`);
|
| 273 |
+
|
| 274 |
+
// Show file size
|
| 275 |
+
const stats = execSync(`wc -l "${outputFile}"`, { encoding: 'utf8' });
|
| 276 |
+
const lines = stats.trim().split(' ')[0];
|
| 277 |
+
console.log(`📊 Result: ${lines} lines written`);
|
| 278 |
+
|
| 279 |
+
} catch (error) {
|
| 280 |
+
console.error('❌ Pandoc conversion failed:');
|
| 281 |
+
console.error(error.message);
|
| 282 |
+
// Clean up temp file on error
|
| 283 |
+
try {
|
| 284 |
+
execSync(`rm "${preprocessedFile}"`, { stdio: 'pipe' });
|
| 285 |
+
} catch { }
|
| 286 |
+
process.exit(1);
|
| 287 |
+
}
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
function main() {
|
| 291 |
+
const config = parseArgs();
|
| 292 |
+
|
| 293 |
+
if (config.clean) {
|
| 294 |
+
console.log('🧹 Cleaning output directory...');
|
| 295 |
+
cleanDirectory(config.output);
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
convertLatexToMarkdown(config.input, config.output);
|
| 299 |
+
|
| 300 |
+
console.log('🎉 Simple conversion completed!');
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
// Show help if requested
|
| 304 |
+
if (process.argv.includes('--help') || process.argv.includes('-h')) {
|
| 305 |
+
console.log(`
|
| 306 |
+
🚀 Simple LaTeX to Markdown Converter
|
| 307 |
+
|
| 308 |
+
Usage:
|
| 309 |
+
node scripts/simple-latex-to-markdown.mjs [options]
|
| 310 |
+
|
| 311 |
+
Options:
|
| 312 |
+
--input=PATH Input LaTeX file (default: latex-converter/input-example/main.tex)
|
| 313 |
+
--output=PATH Output directory (default: output/)
|
| 314 |
+
--clean Clean output directory before conversion
|
| 315 |
+
--help, -h Show this help
|
| 316 |
+
|
| 317 |
+
Examples:
|
| 318 |
+
# Basic conversion
|
| 319 |
+
node scripts/simple-latex-to-markdown.mjs
|
| 320 |
+
|
| 321 |
+
# Custom paths
|
| 322 |
+
node scripts/simple-latex-to-markdown.mjs --input=my-paper.tex --output=converted/
|
| 323 |
+
|
| 324 |
+
# Clean output first
|
| 325 |
+
node scripts/simple-latex-to-markdown.mjs --clean
|
| 326 |
+
`);
|
| 327 |
+
process.exit(0);
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
main();
|
app/scripts/latex-importer/mdx-converter.mjs
ADDED
|
@@ -0,0 +1,896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
| 4 |
+
import { join, dirname, basename, extname } from 'path';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
import { extractAndGenerateFrontmatter } from './metadata-extractor.mjs';
|
| 7 |
+
|
| 8 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 9 |
+
const __dirname = dirname(__filename);
|
| 10 |
+
|
| 11 |
+
// Configuration
|
| 12 |
+
const DEFAULT_INPUT = join(__dirname, 'output', 'main.md');
|
| 13 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output', 'main.mdx');
|
| 14 |
+
|
| 15 |
+
function parseArgs() {
|
| 16 |
+
const args = process.argv.slice(2);
|
| 17 |
+
const config = {
|
| 18 |
+
input: DEFAULT_INPUT,
|
| 19 |
+
output: DEFAULT_OUTPUT,
|
| 20 |
+
};
|
| 21 |
+
|
| 22 |
+
for (const arg of args) {
|
| 23 |
+
if (arg.startsWith('--input=')) {
|
| 24 |
+
config.input = arg.substring('--input='.length);
|
| 25 |
+
} else if (arg.startsWith('--output=')) {
|
| 26 |
+
config.output = arg.substring('--output='.length);
|
| 27 |
+
} else if (arg === '--help' || arg === '-h') {
|
| 28 |
+
console.log(`
|
| 29 |
+
📝 Markdown to MDX Converter
|
| 30 |
+
|
| 31 |
+
Usage:
|
| 32 |
+
node mdx-converter.mjs [options]
|
| 33 |
+
|
| 34 |
+
Options:
|
| 35 |
+
--input=PATH Input Markdown file (default: ${DEFAULT_INPUT})
|
| 36 |
+
--output=PATH Output MDX file (default: ${DEFAULT_OUTPUT})
|
| 37 |
+
--help, -h Show this help
|
| 38 |
+
|
| 39 |
+
Examples:
|
| 40 |
+
# Basic conversion
|
| 41 |
+
node mdx-converter.mjs
|
| 42 |
+
|
| 43 |
+
# Custom paths
|
| 44 |
+
node mdx-converter.mjs --input=article.md --output=article.mdx
|
| 45 |
+
`);
|
| 46 |
+
process.exit(0);
|
| 47 |
+
} else if (!config.input) {
|
| 48 |
+
config.input = arg;
|
| 49 |
+
} else if (!config.output) {
|
| 50 |
+
config.output = arg;
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
return config;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
/**
|
| 57 |
+
* Modular MDX post-processing functions for Astro compatibility
|
| 58 |
+
* Each function handles a specific type of transformation
|
| 59 |
+
*/
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* Track which Astro components are used during transformations
|
| 63 |
+
*/
|
| 64 |
+
const usedComponents = new Set();
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
* Track individual image imports needed
|
| 68 |
+
*/
|
| 69 |
+
const imageImports = new Map(); // src -> varName
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Add required component imports to the frontmatter
|
| 73 |
+
* @param {string} content - MDX content
|
| 74 |
+
* @returns {string} - Content with component imports
|
| 75 |
+
*/
|
| 76 |
+
/**
|
| 77 |
+
* Generate a variable name from image path
|
| 78 |
+
* @param {string} src - Image source path
|
| 79 |
+
* @returns {string} - Valid variable name
|
| 80 |
+
*/
|
| 81 |
+
function generateImageVarName(src) {
|
| 82 |
+
// Extract filename without extension and make it a valid JS variable
|
| 83 |
+
const filename = src.split('/').pop().replace(/\.[^.]+$/, '');
|
| 84 |
+
return filename.replace(/[^a-zA-Z0-9]/g, '_').replace(/^[0-9]/, 'img_$&');
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
function addComponentImports(content) {
|
| 88 |
+
console.log(' 📦 Adding component and image imports...');
|
| 89 |
+
|
| 90 |
+
let imports = [];
|
| 91 |
+
|
| 92 |
+
// Add component imports
|
| 93 |
+
if (usedComponents.size > 0) {
|
| 94 |
+
const componentImports = Array.from(usedComponents)
|
| 95 |
+
.map(component => `import ${component} from '../components/${component}.astro';`);
|
| 96 |
+
imports.push(...componentImports);
|
| 97 |
+
console.log(` ✅ Importing components: ${Array.from(usedComponents).join(', ')}`);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
// Add image imports
|
| 101 |
+
if (imageImports.size > 0) {
|
| 102 |
+
const imageImportStatements = Array.from(imageImports.entries())
|
| 103 |
+
.map(([src, varName]) => `import ${varName} from '${src}';`);
|
| 104 |
+
imports.push(...imageImportStatements);
|
| 105 |
+
console.log(` ✅ Importing ${imageImports.size} image(s)`);
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
if (imports.length === 0) {
|
| 109 |
+
console.log(' ℹ️ No imports needed');
|
| 110 |
+
return content;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
const importBlock = imports.join('\n');
|
| 114 |
+
|
| 115 |
+
// Insert imports after frontmatter
|
| 116 |
+
const frontmatterEnd = content.indexOf('---', 3) + 3;
|
| 117 |
+
if (frontmatterEnd > 2) {
|
| 118 |
+
return content.slice(0, frontmatterEnd) + '\n\n' + importBlock + '\n' + content.slice(frontmatterEnd);
|
| 119 |
+
} else {
|
| 120 |
+
// No frontmatter, add at beginning
|
| 121 |
+
return importBlock + '\n\n' + content;
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
/**
|
| 127 |
+
* Convert grouped figures (subfigures) to MultiFigure components
|
| 128 |
+
* @param {string} content - MDX content
|
| 129 |
+
* @returns {string} - Content with MultiFigure components for grouped figures
|
| 130 |
+
*/
|
| 131 |
+
function convertSubfiguresToMultiFigure(content) {
|
| 132 |
+
console.log(' 🖼️✨ Converting subfigures to MultiFigure components...');
|
| 133 |
+
|
| 134 |
+
let convertedCount = 0;
|
| 135 |
+
|
| 136 |
+
// Pattern to match: <figure> containing multiple <figure> elements with a global caption
|
| 137 |
+
// This matches the LaTeX subfigure pattern that gets converted by Pandoc
|
| 138 |
+
const subfigureGroupPattern = /<figure>\s*((?:<figure>[\s\S]*?<\/figure>\s*){2,})<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/g;
|
| 139 |
+
|
| 140 |
+
const convertedContent = content.replace(subfigureGroupPattern, (match, figuresMatch, globalCaption) => {
|
| 141 |
+
convertedCount++;
|
| 142 |
+
|
| 143 |
+
// Extract individual figures within the group
|
| 144 |
+
// This pattern is more flexible to handle variations in HTML structure
|
| 145 |
+
const individualFigurePattern = /<figure>\s*<img src="([^"]*)"[^>]*\/>\s*<p><span id="([^"]*)"[^&]*><\/span><\/p>\s*<figcaption>([\s\S]*?)<\/figcaption>\s*<\/figure>/g;
|
| 146 |
+
|
| 147 |
+
const images = [];
|
| 148 |
+
let figureMatch;
|
| 149 |
+
|
| 150 |
+
while ((figureMatch = individualFigurePattern.exec(figuresMatch)) !== null) {
|
| 151 |
+
const [, src, id, caption] = figureMatch;
|
| 152 |
+
|
| 153 |
+
// Clean the source path (similar to existing transformImages function)
|
| 154 |
+
const cleanSrc = src.replace(/.*\/output\/assets\//, './assets/')
|
| 155 |
+
.replace(/\/Users\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/app\/scripts\/latex-to-markdown\/output\/assets\//, './assets/');
|
| 156 |
+
|
| 157 |
+
// Clean caption text (remove HTML, normalize whitespace)
|
| 158 |
+
const cleanCaption = caption
|
| 159 |
+
.replace(/<[^>]*>/g, '')
|
| 160 |
+
.replace(/\n/g, ' ')
|
| 161 |
+
.replace(/\s+/g, ' ')
|
| 162 |
+
.replace(/'/g, "\\'")
|
| 163 |
+
.trim();
|
| 164 |
+
|
| 165 |
+
// Generate alt text from caption
|
| 166 |
+
const altText = cleanCaption.length > 100
|
| 167 |
+
? cleanCaption.substring(0, 100) + '...'
|
| 168 |
+
: cleanCaption;
|
| 169 |
+
|
| 170 |
+
// Generate variable name for import
|
| 171 |
+
const varName = generateImageVarName(cleanSrc);
|
| 172 |
+
imageImports.set(cleanSrc, varName);
|
| 173 |
+
|
| 174 |
+
images.push({
|
| 175 |
+
src: varName,
|
| 176 |
+
alt: altText,
|
| 177 |
+
caption: cleanCaption,
|
| 178 |
+
id: id
|
| 179 |
+
});
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
// Clean global caption
|
| 183 |
+
const cleanGlobalCaption = globalCaption
|
| 184 |
+
.replace(/<[^>]*>/g, '')
|
| 185 |
+
.replace(/\n/g, ' ')
|
| 186 |
+
.replace(/\s+/g, ' ')
|
| 187 |
+
.replace(/'/g, "\\'")
|
| 188 |
+
.trim();
|
| 189 |
+
|
| 190 |
+
// Mark MultiFigure component as used
|
| 191 |
+
usedComponents.add('MultiFigure');
|
| 192 |
+
|
| 193 |
+
// Determine layout based on number of images
|
| 194 |
+
let layout = 'auto';
|
| 195 |
+
if (images.length === 2) layout = '2-column';
|
| 196 |
+
else if (images.length === 3) layout = '3-column';
|
| 197 |
+
else if (images.length === 4) layout = '4-column';
|
| 198 |
+
|
| 199 |
+
// Generate MultiFigure component
|
| 200 |
+
const imagesJson = images.map(img =>
|
| 201 |
+
` {\n src: ${img.src},\n alt: "${img.alt}",\n caption: "${img.caption}",\n id: "${img.id}"\n }`
|
| 202 |
+
).join(',\n');
|
| 203 |
+
|
| 204 |
+
return `<MultiFigure
|
| 205 |
+
images={[
|
| 206 |
+
${imagesJson}
|
| 207 |
+
]}
|
| 208 |
+
layout="${layout}"
|
| 209 |
+
zoomable
|
| 210 |
+
downloadable
|
| 211 |
+
caption="${cleanGlobalCaption}"
|
| 212 |
+
/>`;
|
| 213 |
+
});
|
| 214 |
+
|
| 215 |
+
if (convertedCount > 0) {
|
| 216 |
+
console.log(` ✅ Converted ${convertedCount} subfigure group(s) to MultiFigure component(s)`);
|
| 217 |
+
} else {
|
| 218 |
+
console.log(' ℹ️ No subfigure groups found');
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
return convertedContent;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
/**
|
| 225 |
+
* Transform images to Figure components
|
| 226 |
+
* @param {string} content - MDX content
|
| 227 |
+
* @returns {string} - Content with Figure components
|
| 228 |
+
*/
|
| 229 |
+
/**
|
| 230 |
+
* Create Figure component with import
|
| 231 |
+
* @param {string} src - Clean image source
|
| 232 |
+
* @param {string} alt - Alt text
|
| 233 |
+
* @param {string} id - Element ID
|
| 234 |
+
* @param {string} caption - Figure caption
|
| 235 |
+
* @param {string} width - Optional width
|
| 236 |
+
* @returns {string} - Figure component markup
|
| 237 |
+
*/
|
| 238 |
+
function createFigureComponent(src, alt = '', id = '', caption = '', width = '') {
|
| 239 |
+
const varName = generateImageVarName(src);
|
| 240 |
+
imageImports.set(src, varName);
|
| 241 |
+
usedComponents.add('Figure');
|
| 242 |
+
|
| 243 |
+
const props = [];
|
| 244 |
+
props.push(`src={${varName}}`);
|
| 245 |
+
props.push('zoomable');
|
| 246 |
+
props.push('downloadable');
|
| 247 |
+
if (id) props.push(`id="${id}"`);
|
| 248 |
+
props.push('layout="fixed"');
|
| 249 |
+
if (alt) props.push(`alt="${alt}"`);
|
| 250 |
+
if (caption) props.push(`caption={'${caption}'}`);
|
| 251 |
+
|
| 252 |
+
return `<Figure\n ${props.join('\n ')}\n/>`;
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
function transformImages(content) {
|
| 256 |
+
console.log(' 🖼️ Transforming images to Figure components with imports...');
|
| 257 |
+
|
| 258 |
+
let hasImages = false;
|
| 259 |
+
|
| 260 |
+
// Helper function to clean source paths
|
| 261 |
+
const cleanSrcPath = (src) => {
|
| 262 |
+
return src.replace(/.*\/output\/assets\//, './assets/')
|
| 263 |
+
.replace(/\/Users\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/[^\/]+\/app\/scripts\/latex-to-markdown\/output\/assets\//, './assets/');
|
| 264 |
+
};
|
| 265 |
+
|
| 266 |
+
// Helper to clean caption text
|
| 267 |
+
const cleanCaption = (caption) => {
|
| 268 |
+
return caption
|
| 269 |
+
.replace(/<[^>]*>/g, '') // Remove HTML tags
|
| 270 |
+
.replace(/\n/g, ' ') // Replace newlines with spaces
|
| 271 |
+
.replace(/\r/g, ' ') // Replace carriage returns with spaces
|
| 272 |
+
.replace(/\s+/g, ' ') // Replace multiple spaces with single space
|
| 273 |
+
.replace(/'/g, "\\'") // Escape quotes
|
| 274 |
+
.trim(); // Trim whitespace
|
| 275 |
+
};
|
| 276 |
+
|
| 277 |
+
// Helper to clean alt text
|
| 278 |
+
const cleanAltText = (alt, maxLength = 100) => {
|
| 279 |
+
const cleaned = alt
|
| 280 |
+
.replace(/<[^>]*>/g, '') // Remove HTML tags
|
| 281 |
+
.replace(/\n/g, ' ') // Replace newlines with spaces
|
| 282 |
+
.replace(/\r/g, ' ') // Replace carriage returns with spaces
|
| 283 |
+
.replace(/\s+/g, ' ') // Replace multiple spaces with single space
|
| 284 |
+
.trim(); // Trim whitespace
|
| 285 |
+
|
| 286 |
+
return cleaned.length > maxLength
|
| 287 |
+
? cleaned.substring(0, maxLength) + '...'
|
| 288 |
+
: cleaned;
|
| 289 |
+
};
|
| 290 |
+
|
| 291 |
+
// 1. Transform complex HTML figures with style attributes
|
| 292 |
+
content = content.replace(
|
| 293 |
+
/<figure id="([^"]*)">\s*<img src="([^"]*)"(?:\s+style="([^"]*)")?\s*\/>\s*<figcaption>\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs,
|
| 294 |
+
(match, id, src, style, caption) => {
|
| 295 |
+
const cleanSrc = cleanSrcPath(src);
|
| 296 |
+
const cleanCap = cleanCaption(caption);
|
| 297 |
+
const altText = cleanAltText(cleanCap);
|
| 298 |
+
hasImages = true;
|
| 299 |
+
|
| 300 |
+
return createFigureComponent(cleanSrc, altText, id, cleanCap);
|
| 301 |
+
}
|
| 302 |
+
);
|
| 303 |
+
|
| 304 |
+
// 2. Transform standalone img tags with style
|
| 305 |
+
content = content.replace(
|
| 306 |
+
/<img src="([^"]*)"(?:\s+style="([^"]*)")?\s*(?:alt="([^"]*)")?\s*\/>/g,
|
| 307 |
+
(match, src, style, alt) => {
|
| 308 |
+
const cleanSrc = cleanSrcPath(src);
|
| 309 |
+
const cleanAlt = cleanAltText(alt || 'Figure');
|
| 310 |
+
hasImages = true;
|
| 311 |
+
|
| 312 |
+
return createFigureComponent(cleanSrc, cleanAlt);
|
| 313 |
+
}
|
| 314 |
+
);
|
| 315 |
+
|
| 316 |
+
// 3. Transform images within wrapfigure divs
|
| 317 |
+
content = content.replace(
|
| 318 |
+
/<div class="wrapfigure">\s*r[\d.]+\s*<img src="([^"]*)"[^>]*\/>\s*<\/div>/gs,
|
| 319 |
+
(match, src) => {
|
| 320 |
+
const cleanSrc = cleanSrcPath(src);
|
| 321 |
+
hasImages = true;
|
| 322 |
+
|
| 323 |
+
return createFigureComponent(cleanSrc, 'Figure');
|
| 324 |
+
}
|
| 325 |
+
);
|
| 326 |
+
|
| 327 |
+
// 4. Transform simple HTML figure/img without style
|
| 328 |
+
content = content.replace(
|
| 329 |
+
/<figure id="([^"]*)">\s*<img src="([^"]*)" \/>\s*<figcaption>\s*(.*?)\s*<\/figcaption>\s*<\/figure>/gs,
|
| 330 |
+
(match, id, src, caption) => {
|
| 331 |
+
const cleanSrc = cleanSrcPath(src);
|
| 332 |
+
const cleanCap = cleanCaption(caption);
|
| 333 |
+
const altText = cleanAltText(cleanCap);
|
| 334 |
+
hasImages = true;
|
| 335 |
+
|
| 336 |
+
return createFigureComponent(cleanSrc, altText, id, cleanCap);
|
| 337 |
+
}
|
| 338 |
+
);
|
| 339 |
+
|
| 340 |
+
// 5. Clean up figures with minipage divs
|
| 341 |
+
content = content.replace(
|
| 342 |
+
/<figure id="([^"]*)">\s*<div class="minipage">\s*<img src="([^"]*)"[^>]*\/>\s*<\/div>\s*<figcaption[^>]*>(.*?)<\/figcaption>\s*<\/figure>/gs,
|
| 343 |
+
(match, id, src, caption) => {
|
| 344 |
+
const cleanSrc = cleanSrcPath(src);
|
| 345 |
+
const cleanCap = cleanCaption(caption);
|
| 346 |
+
const altText = cleanAltText(cleanCap);
|
| 347 |
+
hasImages = true;
|
| 348 |
+
|
| 349 |
+
return createFigureComponent(cleanSrc, altText, id, cleanCap);
|
| 350 |
+
}
|
| 351 |
+
);
|
| 352 |
+
|
| 353 |
+
// 6. Transform Pandoc-style images: {#id attr="value"}
|
| 354 |
+
content = content.replace(
|
| 355 |
+
/!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
|
| 356 |
+
(match, alt, src, attributes) => {
|
| 357 |
+
const cleanSrc = cleanSrcPath(src);
|
| 358 |
+
const cleanAlt = cleanAltText(alt || 'Figure');
|
| 359 |
+
hasImages = true;
|
| 360 |
+
|
| 361 |
+
let id = '';
|
| 362 |
+
if (attributes) {
|
| 363 |
+
const idMatch = attributes.match(/#([\w-]+)/);
|
| 364 |
+
if (idMatch) id = idMatch[1];
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
return createFigureComponent(cleanSrc, cleanAlt, id);
|
| 368 |
+
}
|
| 369 |
+
);
|
| 370 |
+
|
| 371 |
+
if (hasImages) {
|
| 372 |
+
console.log(' ✅ Figure components with imports will be created');
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
return content;
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
/**
|
| 379 |
+
* Transform HTML spans with style attributes to appropriate components
|
| 380 |
+
* @param {string} content - MDX content
|
| 381 |
+
* @returns {string} - Content with transformed spans
|
| 382 |
+
*/
|
| 383 |
+
function transformStyledSpans(content) {
|
| 384 |
+
console.log(' 🎨 Transforming styled spans...');
|
| 385 |
+
|
| 386 |
+
// Transform HTML spans with style attributes
|
| 387 |
+
content = content.replace(
|
| 388 |
+
/<span style="color: ([^"]+)">(.*?)<\/span>/g,
|
| 389 |
+
(match, color, text) => {
|
| 390 |
+
// Map colors to semantic classes or components
|
| 391 |
+
const colorMap = {
|
| 392 |
+
'hf2': 'text-hf-secondary',
|
| 393 |
+
'hf1': 'text-hf-primary'
|
| 394 |
+
};
|
| 395 |
+
|
| 396 |
+
const className = colorMap[color] || `text-${color}`;
|
| 397 |
+
return `<span class="${className}">${text}</span>`;
|
| 398 |
+
}
|
| 399 |
+
);
|
| 400 |
+
|
| 401 |
+
// Transform markdown spans with style attributes: [text]{style="color: color"}
|
| 402 |
+
content = content.replace(
|
| 403 |
+
/\[([^\]]+)\]\{style="color: ([^"]+)"\}/g,
|
| 404 |
+
(match, text, color) => {
|
| 405 |
+
// Map colors to semantic classes or components
|
| 406 |
+
const colorMap = {
|
| 407 |
+
'hf2': 'text-hf-secondary',
|
| 408 |
+
'hf1': 'text-hf-primary'
|
| 409 |
+
};
|
| 410 |
+
|
| 411 |
+
const className = colorMap[color] || `text-${color}`;
|
| 412 |
+
return `<span class="${className}">${text}</span>`;
|
| 413 |
+
}
|
| 414 |
+
);
|
| 415 |
+
|
| 416 |
+
return content;
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
/**
|
| 420 |
+
* Transform reference links to proper Astro internal links
|
| 421 |
+
* @param {string} content - MDX content
|
| 422 |
+
* @returns {string} - Content with transformed links
|
| 423 |
+
*/
|
| 424 |
+
function fixHtmlEscaping(content) {
|
| 425 |
+
console.log(' 🔧 Fixing HTML escaping in spans...');
|
| 426 |
+
|
| 427 |
+
let fixedCount = 0;
|
| 428 |
+
|
| 429 |
+
// Pattern 1: \<span id="..." style="..."\>\</span\>
|
| 430 |
+
content = content.replace(/\\<span id="([^"]*)" style="([^"]*)"\\>\\<\/span\\>/g, (match, id, style) => {
|
| 431 |
+
fixedCount++;
|
| 432 |
+
// Fix common style issues like "position- absolute;" -> "position: absolute;"
|
| 433 |
+
const cleanStyle = style.replace('position- absolute;', 'position: absolute;');
|
| 434 |
+
return `<span id="${id}" style="${cleanStyle}"></span>`;
|
| 435 |
+
});
|
| 436 |
+
|
| 437 |
+
// Pattern 2: \<span class="..."\>...\</span\>
|
| 438 |
+
content = content.replace(/\\<span class="([^"]*)"\\>([^\\]+)\\<\/span\\>/g, (match, className, text) => {
|
| 439 |
+
fixedCount++;
|
| 440 |
+
// Remove numbering like (1), (2), (3) from highlight spans
|
| 441 |
+
let cleanText = text;
|
| 442 |
+
if (className === 'highlight') {
|
| 443 |
+
cleanText = text.replace(/^\(\d+\)\s*/, '');
|
| 444 |
+
}
|
| 445 |
+
return `<span class="${className}">${cleanText}</span>`;
|
| 446 |
+
});
|
| 447 |
+
|
| 448 |
+
// Pattern 3: HTML-encoded spans in paragraph tags
|
| 449 |
+
// <p><span id="..." style="..."></span></p>
|
| 450 |
+
content = content.replace(/<p><span id="([^"]*)" style="([^"]*)"><\/span><\/p>/g, (match, id, style) => {
|
| 451 |
+
fixedCount++;
|
| 452 |
+
// Fix common style issues like "position- absolute;" -> "position: absolute;"
|
| 453 |
+
const cleanStyle = style.replace('position- absolute;', 'position: absolute;');
|
| 454 |
+
return `<span id="${id}" style="${cleanStyle}"></span>`;
|
| 455 |
+
});
|
| 456 |
+
|
| 457 |
+
// Pattern 4: HTML-encoded spans with class in paragraph tags
|
| 458 |
+
// <p><span class="...">...</span></p>
|
| 459 |
+
content = content.replace(/<p><span class="([^"]*)">([^&]*)<\/span><\/p>/g, (match, className, text) => {
|
| 460 |
+
fixedCount++;
|
| 461 |
+
// Remove numbering like (1), (2), (3) from highlight spans
|
| 462 |
+
let cleanText = text;
|
| 463 |
+
if (className === 'highlight') {
|
| 464 |
+
cleanText = text.replace(/^\(\d+\)\s*/, '');
|
| 465 |
+
}
|
| 466 |
+
return `<span class="${className}">${cleanText}</span>`;
|
| 467 |
+
});
|
| 468 |
+
|
| 469 |
+
if (fixedCount > 0) {
|
| 470 |
+
console.log(` ✅ Fixed ${fixedCount} escaped span(s)`);
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
return content;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
function cleanHighlightNumbering(content) {
|
| 477 |
+
console.log(' 🔢 Removing numbering from highlight spans...');
|
| 478 |
+
|
| 479 |
+
let cleanedCount = 0;
|
| 480 |
+
// Clean numbering from non-escaped highlight spans too
|
| 481 |
+
content = content.replace(/<span class="highlight">(\(\d+\)\s*)([^<]+)<\/span>/g, (match, numbering, text) => {
|
| 482 |
+
cleanedCount++;
|
| 483 |
+
return `<span class="highlight">${text}</span>`;
|
| 484 |
+
});
|
| 485 |
+
|
| 486 |
+
if (cleanedCount > 0) {
|
| 487 |
+
console.log(` ✅ Removed numbering from ${cleanedCount} highlight span(s)`);
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
return content;
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
function transformReferenceLinks(content) {
|
| 494 |
+
console.log(' 🔗 Transforming reference links...');
|
| 495 |
+
|
| 496 |
+
// Transform Pandoc reference links: [text](#ref){reference-type="ref" reference="ref"}
|
| 497 |
+
return content.replace(
|
| 498 |
+
/\[([^\]]+)\]\((#[^)]+)\)\{[^}]*reference[^}]*\}/g,
|
| 499 |
+
(match, text, href) => {
|
| 500 |
+
return `[${text}](${href})`;
|
| 501 |
+
}
|
| 502 |
+
);
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
/**
|
| 507 |
+
* Fix frontmatter and ensure proper MDX format
|
| 508 |
+
* @param {string} content - MDX content
|
| 509 |
+
* @param {string} latexContent - Original LaTeX content for metadata extraction
|
| 510 |
+
* @returns {string} - Content with proper frontmatter
|
| 511 |
+
*/
|
| 512 |
+
function ensureFrontmatter(content, latexContent = '') {
|
| 513 |
+
console.log(' 📄 Ensuring proper frontmatter...');
|
| 514 |
+
|
| 515 |
+
if (!content.startsWith('---')) {
|
| 516 |
+
let frontmatter;
|
| 517 |
+
|
| 518 |
+
if (latexContent) {
|
| 519 |
+
// Extract metadata from LaTeX using dedicated module
|
| 520 |
+
frontmatter = extractAndGenerateFrontmatter(latexContent);
|
| 521 |
+
console.log(' ✅ Generated frontmatter from LaTeX metadata');
|
| 522 |
+
} else {
|
| 523 |
+
// Fallback frontmatter
|
| 524 |
+
const currentDate = new Date().toLocaleDateString('en-US', {
|
| 525 |
+
year: 'numeric',
|
| 526 |
+
month: 'short',
|
| 527 |
+
day: '2-digit'
|
| 528 |
+
});
|
| 529 |
+
frontmatter = `---
|
| 530 |
+
title: "Research Article"
|
| 531 |
+
published: "${currentDate}"
|
| 532 |
+
tableOfContentsAutoCollapse: true
|
| 533 |
+
---
|
| 534 |
+
|
| 535 |
+
`;
|
| 536 |
+
console.log(' ✅ Generated basic frontmatter');
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
return frontmatter + content;
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
return content;
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
/**
|
| 546 |
+
* Fix mixed math delimiters like $`...`$ or `...`$
|
| 547 |
+
* @param {string} content - MDX content
|
| 548 |
+
* @returns {string} - Content with fixed math delimiters
|
| 549 |
+
*/
|
| 550 |
+
function fixMixedMathDelimiters(content) {
|
| 551 |
+
console.log(' 🔧 Fixing mixed math delimiters...');
|
| 552 |
+
|
| 553 |
+
let fixedCount = 0;
|
| 554 |
+
|
| 555 |
+
// Fix patterns like $`...`$ (mixed delimiters)
|
| 556 |
+
content = content.replace(/\$`([^`]*)`\$/g, (match, mathContent) => {
|
| 557 |
+
fixedCount++;
|
| 558 |
+
return `$${mathContent}$`;
|
| 559 |
+
});
|
| 560 |
+
|
| 561 |
+
// Fix patterns like `...`$ (backtick start, dollar end)
|
| 562 |
+
content = content.replace(/`([^`]*)`\$/g, (match, mathContent) => {
|
| 563 |
+
fixedCount++;
|
| 564 |
+
return `$${mathContent}$`;
|
| 565 |
+
});
|
| 566 |
+
|
| 567 |
+
// Fix patterns like $`...` (dollar start, backtick end - less common)
|
| 568 |
+
content = content.replace(/\$`([^`]*)`(?!\$)/g, (match, mathContent) => {
|
| 569 |
+
fixedCount++;
|
| 570 |
+
return `$${mathContent}$`;
|
| 571 |
+
});
|
| 572 |
+
|
| 573 |
+
if (fixedCount > 0) {
|
| 574 |
+
console.log(` ✅ Fixed ${fixedCount} mixed math delimiter(s)`);
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
return content;
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
/**
|
| 581 |
+
* Clean up orphaned math delimiters and fix mixed content
|
| 582 |
+
* @param {string} content - MDX content
|
| 583 |
+
* @returns {string} - Content with cleaned math blocks
|
| 584 |
+
*/
|
| 585 |
+
function cleanOrphanedMathDelimiters(content) {
|
| 586 |
+
console.log(' 🧹 Cleaning orphaned math delimiters...');
|
| 587 |
+
console.log(' 🔍 Content length:', content.length, 'chars');
|
| 588 |
+
|
| 589 |
+
let fixedCount = 0;
|
| 590 |
+
|
| 591 |
+
// Fix orphaned $$ that are alone on lines (but not part of display math blocks)
|
| 592 |
+
// Only remove $$ that appear alone without corresponding closing $$
|
| 593 |
+
content = content.replace(/^\$\$\s*$(?!\s*[\s\S]*?\$\$)/gm, () => {
|
| 594 |
+
fixedCount++;
|
| 595 |
+
return '';
|
| 596 |
+
});
|
| 597 |
+
|
| 598 |
+
// Fix backticks inside $$....$$ blocks (Pandoc artifact)
|
| 599 |
+
const mathMatches = content.match(/\$\$([\s\S]*?)\$\$/g);
|
| 600 |
+
console.log(` 🔍 Found ${mathMatches ? mathMatches.length : 0} math blocks`);
|
| 601 |
+
|
| 602 |
+
content = content.replace(/\$\$([\s\S]*?)\$\$/g, (match, mathContent) => {
|
| 603 |
+
// More aggressive: remove ALL single backticks in math blocks (they shouldn't be there)
|
| 604 |
+
let cleanedMath = mathContent;
|
| 605 |
+
|
| 606 |
+
// Count backticks before
|
| 607 |
+
const backticksBefore = (mathContent.match(/`/g) || []).length;
|
| 608 |
+
|
| 609 |
+
if (backticksBefore > 0) {
|
| 610 |
+
console.log(` 🔧 Found math block with ${backticksBefore} backtick(s)`);
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
+
// Remove all isolated backticks (not in pairs)
|
| 614 |
+
cleanedMath = cleanedMath.replace(/`/g, '');
|
| 615 |
+
|
| 616 |
+
const backticksAfter = (cleanedMath.match(/`/g) || []).length;
|
| 617 |
+
|
| 618 |
+
if (backticksBefore > 0) {
|
| 619 |
+
fixedCount++;
|
| 620 |
+
console.log(` 🔧 Removed ${backticksBefore} backtick(s) from math block`);
|
| 621 |
+
return `$$${cleanedMath}$$`;
|
| 622 |
+
}
|
| 623 |
+
return match;
|
| 624 |
+
});
|
| 625 |
+
|
| 626 |
+
// Fix escaped align in math blocks: \begin{align} -> \begin{align}
|
| 627 |
+
content = content.replace(/\\begin\{align\}/g, (match) => {
|
| 628 |
+
fixedCount++;
|
| 629 |
+
return '\\begin{align}';
|
| 630 |
+
});
|
| 631 |
+
|
| 632 |
+
content = content.replace(/\\end\{align\}/g, (match) => {
|
| 633 |
+
fixedCount++;
|
| 634 |
+
return '\\end{align}';
|
| 635 |
+
});
|
| 636 |
+
|
| 637 |
+
// Fix cases where text gets mixed with math blocks
|
| 638 |
+
// Pattern: ``` math ... ``` text ``` math
|
| 639 |
+
content = content.replace(/``` math\s*\n([\s\S]*?)\n```\s*([^`\n]*?)\s*``` math/g, (match, math1, text, math2) => {
|
| 640 |
+
if (text.trim().length > 0 && !text.includes('```')) {
|
| 641 |
+
fixedCount++;
|
| 642 |
+
return '```' + ' math\n' + math1 + '\n```\n\n' + text.trim() + '\n\n```' + ' math';
|
| 643 |
+
}
|
| 644 |
+
return match;
|
| 645 |
+
});
|
| 646 |
+
|
| 647 |
+
if (fixedCount > 0) {
|
| 648 |
+
console.log(` ✅ Fixed ${fixedCount} orphaned math delimiter(s)`);
|
| 649 |
+
}
|
| 650 |
+
|
| 651 |
+
return content;
|
| 652 |
+
}
|
| 653 |
+
|
| 654 |
+
/**
|
| 655 |
+
* Clean newlines from single-dollar math blocks ($...$) ONLY
|
| 656 |
+
* @param {string} content - MDX content
|
| 657 |
+
* @returns {string} - Content with cleaned math blocks
|
| 658 |
+
*/
|
| 659 |
+
function cleanSingleLineMathNewlines(content) {
|
| 660 |
+
console.log(' 🔢 Cleaning newlines in single-dollar math blocks ($...$)...');
|
| 661 |
+
|
| 662 |
+
let cleanedCount = 0;
|
| 663 |
+
|
| 664 |
+
// ULTRA STRICT: Only target single dollar blocks ($...$) that contain newlines
|
| 665 |
+
// Use dotall flag (s) to match newlines with .*, and ensure we don't match $$
|
| 666 |
+
const cleanedContent = content.replace(/\$(?!\$)([\s\S]*?)\$(?!\$)/g, (match, mathContent) => {
|
| 667 |
+
// Only process if the content contains newlines
|
| 668 |
+
if (mathContent.includes('\n')) {
|
| 669 |
+
cleanedCount++;
|
| 670 |
+
|
| 671 |
+
// Remove ALL newlines and carriage returns, normalize whitespace
|
| 672 |
+
const cleanedMath = mathContent
|
| 673 |
+
.replace(/\n+/g, ' ') // Replace all newlines with spaces
|
| 674 |
+
.replace(/\r+/g, ' ') // Replace carriage returns with spaces
|
| 675 |
+
.replace(/\s+/g, ' ') // Normalize multiple spaces to single
|
| 676 |
+
.trim(); // Remove leading/trailing spaces
|
| 677 |
+
|
| 678 |
+
return `$${cleanedMath}$`;
|
| 679 |
+
}
|
| 680 |
+
return match; // Keep original if no newlines
|
| 681 |
+
});
|
| 682 |
+
|
| 683 |
+
if (cleanedCount > 0) {
|
| 684 |
+
console.log(` ✅ Cleaned ${cleanedCount} single-dollar math block(s) with newlines`);
|
| 685 |
+
}
|
| 686 |
+
|
| 687 |
+
return cleanedContent;
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
/**
|
| 691 |
+
* Add proper line breaks around display math blocks ($$...$$)
|
| 692 |
+
* @param {string} content - MDX content
|
| 693 |
+
* @returns {string} - Content with properly spaced display math
|
| 694 |
+
*/
|
| 695 |
+
function formatDisplayMathBlocks(content) {
|
| 696 |
+
console.log(' 📐 Formatting display math blocks with proper spacing...');
|
| 697 |
+
|
| 698 |
+
let formattedCount = 0;
|
| 699 |
+
|
| 700 |
+
// Find all $$...$$$ blocks (display math) and ensure proper line breaks
|
| 701 |
+
// Very strict: only matches exactly $$ followed by content followed by $$
|
| 702 |
+
const formattedContent = content.replace(/\$\$([\s\S]*?)\$\$/g, (match, mathContent) => {
|
| 703 |
+
formattedCount++;
|
| 704 |
+
|
| 705 |
+
// Clean up the math content - trim whitespace but preserve structure
|
| 706 |
+
const cleanedMath = mathContent.trim();
|
| 707 |
+
|
| 708 |
+
// Return with proper line breaks before and after
|
| 709 |
+
return `\n$$\n${cleanedMath}\n$$\n`;
|
| 710 |
+
});
|
| 711 |
+
|
| 712 |
+
if (formattedCount > 0) {
|
| 713 |
+
console.log(` ✅ Formatted ${formattedCount} display math block(s) with proper spacing`);
|
| 714 |
+
}
|
| 715 |
+
|
| 716 |
+
return formattedContent;
|
| 717 |
+
}
|
| 718 |
+
|
| 719 |
+
/**
|
| 720 |
+
* Clean newlines from figcaption content
|
| 721 |
+
* @param {string} content - MDX content
|
| 722 |
+
* @returns {string} - Content with cleaned figcaptions
|
| 723 |
+
*/
|
| 724 |
+
function cleanFigcaptionNewlines(content) {
|
| 725 |
+
console.log(' 📝 Cleaning newlines in figcaption elements...');
|
| 726 |
+
|
| 727 |
+
let cleanedCount = 0;
|
| 728 |
+
|
| 729 |
+
// Find all <figcaption>...</figcaption> blocks and remove internal newlines
|
| 730 |
+
const cleanedContent = content.replace(/<figcaption([^>]*)>([\s\S]*?)<\/figcaption>/g, (match, attributes, captionContent) => {
|
| 731 |
+
// Only process if the content contains newlines
|
| 732 |
+
if (captionContent.includes('\n')) {
|
| 733 |
+
cleanedCount++;
|
| 734 |
+
|
| 735 |
+
// Remove newlines and normalize whitespace
|
| 736 |
+
const cleanedCaption = captionContent
|
| 737 |
+
.replace(/\n+/g, ' ') // Replace newlines with spaces
|
| 738 |
+
.replace(/\s+/g, ' ') // Normalize multiple spaces
|
| 739 |
+
.trim(); // Trim whitespace
|
| 740 |
+
|
| 741 |
+
return `<figcaption${attributes}>${cleanedCaption}</figcaption>`;
|
| 742 |
+
}
|
| 743 |
+
|
| 744 |
+
return match; // Return unchanged if no newlines
|
| 745 |
+
});
|
| 746 |
+
|
| 747 |
+
if (cleanedCount > 0) {
|
| 748 |
+
console.log(` ✅ Cleaned ${cleanedCount} figcaption element(s)`);
|
| 749 |
+
} else {
|
| 750 |
+
console.log(` ℹ️ No figcaption elements with newlines found`);
|
| 751 |
+
}
|
| 752 |
+
|
| 753 |
+
return cleanedContent;
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
/**
|
| 757 |
+
* Remove HTML comments from MDX content
|
| 758 |
+
* @param {string} content - MDX content
|
| 759 |
+
* @returns {string} - Content without HTML comments
|
| 760 |
+
*/
|
| 761 |
+
function removeHtmlComments(content) {
|
| 762 |
+
console.log(' 🗑️ Removing HTML comments...');
|
| 763 |
+
|
| 764 |
+
let removedCount = 0;
|
| 765 |
+
|
| 766 |
+
// Remove all HTML comments <!-- ... -->
|
| 767 |
+
const cleanedContent = content.replace(/<!--[\s\S]*?-->/g, () => {
|
| 768 |
+
removedCount++;
|
| 769 |
+
return '';
|
| 770 |
+
});
|
| 771 |
+
|
| 772 |
+
if (removedCount > 0) {
|
| 773 |
+
console.log(` ✅ Removed ${removedCount} HTML comment(s)`);
|
| 774 |
+
}
|
| 775 |
+
|
| 776 |
+
return cleanedContent;
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
/**
|
| 780 |
+
* Clean up MDX-incompatible syntax
|
| 781 |
+
* @param {string} content - MDX content
|
| 782 |
+
* @returns {string} - Cleaned content
|
| 783 |
+
*/
|
| 784 |
+
function cleanMdxSyntax(content) {
|
| 785 |
+
console.log(' 🧹 Cleaning MDX syntax...');
|
| 786 |
+
|
| 787 |
+
return content
|
| 788 |
+
// NOTE: Math delimiter fixing is now handled by fixMixedMathDelimiters()
|
| 789 |
+
// Ensure proper spacing around JSX-like constructs
|
| 790 |
+
.replace(/>\s*</g, '>\n<')
|
| 791 |
+
// Remove problematic heading attributes - be more specific to avoid matching \begin{align}
|
| 792 |
+
.replace(/^(#{1,6}\s+[^{#\n]+)\{[^}]+\}$/gm, '$1')
|
| 793 |
+
// Fix escaped quotes in text
|
| 794 |
+
.replace(/\\("|')/g, '$1');
|
| 795 |
+
}
|
| 796 |
+
|
| 797 |
+
/**
|
| 798 |
+
* Main MDX processing function that applies all transformations
|
| 799 |
+
* @param {string} content - Raw Markdown content
|
| 800 |
+
* @param {string} latexContent - Original LaTeX content for metadata extraction
|
| 801 |
+
* @returns {string} - Processed MDX content compatible with Astro
|
| 802 |
+
*/
|
| 803 |
+
function processMdxContent(content, latexContent = '') {
|
| 804 |
+
console.log('🔧 Processing for Astro MDX compatibility...');
|
| 805 |
+
|
| 806 |
+
// Clear previous tracking
|
| 807 |
+
usedComponents.clear();
|
| 808 |
+
imageImports.clear();
|
| 809 |
+
|
| 810 |
+
let processedContent = content;
|
| 811 |
+
|
| 812 |
+
// Apply each transformation step sequentially
|
| 813 |
+
processedContent = ensureFrontmatter(processedContent, latexContent);
|
| 814 |
+
processedContent = fixMixedMathDelimiters(processedContent);
|
| 815 |
+
|
| 816 |
+
// Debug: check for $$ blocks after fixMixedMathDelimiters
|
| 817 |
+
const mathBlocksAfterMixed = (processedContent.match(/\$\$([\s\S]*?)\$\$/g) || []).length;
|
| 818 |
+
console.log(` 📊 Math blocks after mixed delimiters fix: ${mathBlocksAfterMixed}`);
|
| 819 |
+
|
| 820 |
+
processedContent = cleanOrphanedMathDelimiters(processedContent);
|
| 821 |
+
processedContent = cleanSingleLineMathNewlines(processedContent);
|
| 822 |
+
processedContent = formatDisplayMathBlocks(processedContent);
|
| 823 |
+
processedContent = removeHtmlComments(processedContent);
|
| 824 |
+
processedContent = cleanMdxSyntax(processedContent);
|
| 825 |
+
processedContent = convertSubfiguresToMultiFigure(processedContent);
|
| 826 |
+
processedContent = transformImages(processedContent);
|
| 827 |
+
processedContent = transformStyledSpans(processedContent);
|
| 828 |
+
processedContent = transformReferenceLinks(processedContent);
|
| 829 |
+
processedContent = fixHtmlEscaping(processedContent);
|
| 830 |
+
processedContent = cleanHighlightNumbering(processedContent);
|
| 831 |
+
processedContent = cleanFigcaptionNewlines(processedContent);
|
| 832 |
+
|
| 833 |
+
// Add component imports at the end
|
| 834 |
+
processedContent = addComponentImports(processedContent);
|
| 835 |
+
|
| 836 |
+
return processedContent;
|
| 837 |
+
}
|
| 838 |
+
|
| 839 |
+
function convertToMdx(inputFile, outputFile) {
|
| 840 |
+
console.log('📝 Modular Markdown to Astro MDX Converter');
|
| 841 |
+
console.log(`📁 Input: ${inputFile}`);
|
| 842 |
+
console.log(`📁 Output: ${outputFile}`);
|
| 843 |
+
|
| 844 |
+
// Check if input file exists
|
| 845 |
+
if (!existsSync(inputFile)) {
|
| 846 |
+
console.error(`❌ Input file not found: ${inputFile}`);
|
| 847 |
+
process.exit(1);
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
try {
|
| 851 |
+
console.log('🔄 Reading Markdown file...');
|
| 852 |
+
const markdownContent = readFileSync(inputFile, 'utf8');
|
| 853 |
+
|
| 854 |
+
// Try to read original LaTeX file for metadata extraction
|
| 855 |
+
let latexContent = '';
|
| 856 |
+
try {
|
| 857 |
+
const inputDir = dirname(inputFile);
|
| 858 |
+
const latexFile = join(inputDir, '..', 'input', 'main.tex');
|
| 859 |
+
if (existsSync(latexFile)) {
|
| 860 |
+
latexContent = readFileSync(latexFile, 'utf8');
|
| 861 |
+
}
|
| 862 |
+
} catch (error) {
|
| 863 |
+
// Ignore LaTeX reading errors - we'll use fallback frontmatter
|
| 864 |
+
}
|
| 865 |
+
|
| 866 |
+
// Apply modular MDX processing
|
| 867 |
+
const mdxContent = processMdxContent(markdownContent, latexContent);
|
| 868 |
+
|
| 869 |
+
console.log('💾 Writing MDX file...');
|
| 870 |
+
writeFileSync(outputFile, mdxContent);
|
| 871 |
+
|
| 872 |
+
console.log(`✅ Conversion completed: ${outputFile}`);
|
| 873 |
+
|
| 874 |
+
// Show file size
|
| 875 |
+
const inputSize = Math.round(markdownContent.length / 1024);
|
| 876 |
+
const outputSize = Math.round(mdxContent.length / 1024);
|
| 877 |
+
console.log(`📊 Input: ${inputSize}KB → Output: ${outputSize}KB`);
|
| 878 |
+
|
| 879 |
+
} catch (error) {
|
| 880 |
+
console.error('❌ Conversion failed:');
|
| 881 |
+
console.error(error.message);
|
| 882 |
+
process.exit(1);
|
| 883 |
+
}
|
| 884 |
+
}
|
| 885 |
+
|
| 886 |
+
export { convertToMdx };
|
| 887 |
+
|
| 888 |
+
function main() {
|
| 889 |
+
const config = parseArgs();
|
| 890 |
+
convertToMdx(config.input, config.output);
|
| 891 |
+
console.log('🎉 MDX conversion completed!');
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 895 |
+
main();
|
| 896 |
+
}
|
app/scripts/latex-importer/metadata-extractor.mjs
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* LaTeX Metadata Extractor
|
| 3 |
+
* Extracts document metadata from LaTeX files for frontmatter generation
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
/**
|
| 7 |
+
* Extract metadata from LaTeX content
|
| 8 |
+
* @param {string} latexContent - Raw LaTeX content
|
| 9 |
+
* @returns {object} - Extracted metadata object
|
| 10 |
+
*/
|
| 11 |
+
export function extractLatexMetadata(latexContent) {
|
| 12 |
+
const metadata = {};
|
| 13 |
+
|
| 14 |
+
// Extract title
|
| 15 |
+
const titleMatch = latexContent.match(/\\title\s*\{\s*([^}]+)\s*\}/s);
|
| 16 |
+
if (titleMatch) {
|
| 17 |
+
metadata.title = titleMatch[1]
|
| 18 |
+
.replace(/\n/g, ' ')
|
| 19 |
+
.trim();
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
// Extract authors with their specific affiliations
|
| 23 |
+
const authors = [];
|
| 24 |
+
const authorMatches = latexContent.matchAll(/\\authorOne\[[^\]]*\]\{([^}]+)\}/g);
|
| 25 |
+
|
| 26 |
+
for (const match of authorMatches) {
|
| 27 |
+
const fullAuthorInfo = match[1];
|
| 28 |
+
|
| 29 |
+
// Determine affiliations based on macros present
|
| 30 |
+
const affiliations = [];
|
| 31 |
+
if (fullAuthorInfo.includes('\\ensps')) {
|
| 32 |
+
affiliations.push(1); // École Normale Supérieure
|
| 33 |
+
}
|
| 34 |
+
if (fullAuthorInfo.includes('\\hf')) {
|
| 35 |
+
affiliations.push(2); // Hugging Face
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
// Clean author name by removing macros
|
| 39 |
+
let authorName = fullAuthorInfo
|
| 40 |
+
.replace(/\\ensps/g, '') // Remove École macro
|
| 41 |
+
.replace(/\\hf/g, '') // Remove Hugging Face macro
|
| 42 |
+
.replace(/\s+/g, ' ') // Normalize whitespace
|
| 43 |
+
.trim();
|
| 44 |
+
|
| 45 |
+
// Skip empty authors or placeholder entries
|
| 46 |
+
if (authorName && authorName !== '...') {
|
| 47 |
+
authors.push({
|
| 48 |
+
name: authorName,
|
| 49 |
+
affiliations: affiliations.length > 0 ? affiliations : [2] // Default to HF if no macro
|
| 50 |
+
});
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
if (authors.length > 0) {
|
| 55 |
+
metadata.authors = authors;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// Extract affiliations - create the two distinct affiliations
|
| 59 |
+
metadata.affiliations = [
|
| 60 |
+
{
|
| 61 |
+
name: "École Normale Supérieure Paris-Saclay"
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
name: "Hugging Face"
|
| 65 |
+
}
|
| 66 |
+
];
|
| 67 |
+
|
| 68 |
+
// Extract date if available (common LaTeX patterns)
|
| 69 |
+
const datePatterns = [
|
| 70 |
+
/\\date\s*\{([^}]+)\}/,
|
| 71 |
+
/\\newcommand\s*\{\\date\}\s*\{([^}]+)\}/,
|
| 72 |
+
];
|
| 73 |
+
|
| 74 |
+
for (const pattern of datePatterns) {
|
| 75 |
+
const dateMatch = latexContent.match(pattern);
|
| 76 |
+
if (dateMatch) {
|
| 77 |
+
metadata.published = dateMatch[1].trim();
|
| 78 |
+
break;
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// Fallback to current date if no date found
|
| 83 |
+
if (!metadata.published) {
|
| 84 |
+
metadata.published = new Date().toLocaleDateString('en-US', {
|
| 85 |
+
year: 'numeric',
|
| 86 |
+
month: 'short',
|
| 87 |
+
day: '2-digit'
|
| 88 |
+
});
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
return metadata;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* Generate YAML frontmatter from metadata object
|
| 96 |
+
* @param {object} metadata - Metadata object
|
| 97 |
+
* @returns {string} - YAML frontmatter string
|
| 98 |
+
*/
|
| 99 |
+
export function generateFrontmatter(metadata) {
|
| 100 |
+
let frontmatter = '---\n';
|
| 101 |
+
|
| 102 |
+
// Title
|
| 103 |
+
if (metadata.title) {
|
| 104 |
+
frontmatter += `title: "${metadata.title}"\n`;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
// Authors
|
| 108 |
+
if (metadata.authors && metadata.authors.length > 0) {
|
| 109 |
+
frontmatter += 'authors:\n';
|
| 110 |
+
metadata.authors.forEach(author => {
|
| 111 |
+
frontmatter += ` - name: "${author.name}"\n`;
|
| 112 |
+
if (author.url) {
|
| 113 |
+
frontmatter += ` url: "${author.url}"\n`;
|
| 114 |
+
}
|
| 115 |
+
frontmatter += ` affiliations: [${author.affiliations.join(', ')}]\n`;
|
| 116 |
+
});
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
// Affiliations
|
| 120 |
+
if (metadata.affiliations && metadata.affiliations.length > 0) {
|
| 121 |
+
frontmatter += 'affiliations:\n';
|
| 122 |
+
metadata.affiliations.forEach((affiliation, index) => {
|
| 123 |
+
frontmatter += ` - name: "${affiliation.name}"\n`;
|
| 124 |
+
if (affiliation.url) {
|
| 125 |
+
frontmatter += ` url: "${affiliation.url}"\n`;
|
| 126 |
+
}
|
| 127 |
+
});
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
// Publication date
|
| 131 |
+
if (metadata.published) {
|
| 132 |
+
frontmatter += `published: "${metadata.published}"\n`;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
// Additional metadata
|
| 136 |
+
if (metadata.doi) {
|
| 137 |
+
frontmatter += `doi: "${metadata.doi}"\n`;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
if (metadata.description) {
|
| 141 |
+
frontmatter += `description: "${metadata.description}"\n`;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
if (metadata.licence) {
|
| 145 |
+
frontmatter += `licence: >\n ${metadata.licence}\n`;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
if (metadata.tags && metadata.tags.length > 0) {
|
| 149 |
+
frontmatter += 'tags:\n';
|
| 150 |
+
metadata.tags.forEach(tag => {
|
| 151 |
+
frontmatter += ` - ${tag}\n`;
|
| 152 |
+
});
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
// Default Astro configuration
|
| 156 |
+
frontmatter += 'tableOfContentsAutoCollapse: true\n';
|
| 157 |
+
frontmatter += '---\n\n';
|
| 158 |
+
|
| 159 |
+
return frontmatter;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
/**
|
| 163 |
+
* Extract and generate frontmatter from LaTeX content
|
| 164 |
+
* @param {string} latexContent - Raw LaTeX content
|
| 165 |
+
* @returns {string} - Complete YAML frontmatter
|
| 166 |
+
*/
|
| 167 |
+
export function extractAndGenerateFrontmatter(latexContent) {
|
| 168 |
+
const metadata = extractLatexMetadata(latexContent);
|
| 169 |
+
return generateFrontmatter(metadata);
|
| 170 |
+
}
|
app/scripts/latex-importer/package-lock.json
ADDED
|
Binary file (56.7 kB). View file
|
|
|
app/scripts/latex-importer/package.json
ADDED
|
Binary file (967 Bytes). View file
|
|
|
app/scripts/latex-importer/post-processor.mjs
ADDED
|
@@ -0,0 +1,439 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { readFileSync, writeFileSync, existsSync, readdirSync } from 'fs';
|
| 4 |
+
import { join, dirname } from 'path';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
|
| 7 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 8 |
+
const __dirname = dirname(__filename);
|
| 9 |
+
|
| 10 |
+
/**
|
| 11 |
+
* Post-processor for cleaning Markdown content from LaTeX conversion
|
| 12 |
+
* Each function handles a specific type of cleanup for maintainability
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
/**
|
| 16 |
+
* Remove TeX low-level grouping commands that break KaTeX
|
| 17 |
+
* @param {string} content - Markdown content
|
| 18 |
+
* @returns {string} - Cleaned content
|
| 19 |
+
*/
|
| 20 |
+
function removeTexGroupingCommands(content) {
|
| 21 |
+
console.log(' 🧹 Removing TeX grouping commands...');
|
| 22 |
+
|
| 23 |
+
return content
|
| 24 |
+
.replace(/\\mathopen\{\}\\mathclose\\bgroup/g, '')
|
| 25 |
+
.replace(/\\aftergroup\\egroup/g, '')
|
| 26 |
+
.replace(/\\bgroup/g, '')
|
| 27 |
+
.replace(/\\egroup/g, '');
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
/**
|
| 31 |
+
* Simplify LaTeX delimiter constructions
|
| 32 |
+
* @param {string} content - Markdown content
|
| 33 |
+
* @returns {string} - Cleaned content
|
| 34 |
+
*/
|
| 35 |
+
function simplifyLatexDelimiters(content) {
|
| 36 |
+
console.log(' 🔧 Simplifying LaTeX delimiters...');
|
| 37 |
+
|
| 38 |
+
return content
|
| 39 |
+
.replace(/\\left\[\s*/g, '[')
|
| 40 |
+
.replace(/\s*\\right\]/g, ']');
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
/**
|
| 44 |
+
* Remove orphaned LaTeX labels
|
| 45 |
+
* @param {string} content - Markdown content
|
| 46 |
+
* @returns {string} - Cleaned content
|
| 47 |
+
*/
|
| 48 |
+
function removeOrphanedLabels(content) {
|
| 49 |
+
console.log(' 🏷️ Removing orphaned labels...');
|
| 50 |
+
|
| 51 |
+
return content
|
| 52 |
+
.replace(/^\s*\\label\{[^}]+\}\s*$/gm, '')
|
| 53 |
+
.replace(/\\label\{[^}]+\}/g, '');
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
/**
|
| 57 |
+
* Fix KaTeX-incompatible math commands
|
| 58 |
+
* @param {string} content - Markdown content
|
| 59 |
+
* @returns {string} - Cleaned content
|
| 60 |
+
*/
|
| 61 |
+
function fixMathCommands(content) {
|
| 62 |
+
console.log(' 📐 Fixing KaTeX-incompatible math commands...');
|
| 63 |
+
|
| 64 |
+
return content
|
| 65 |
+
// Replace \hdots with \ldots (KaTeX compatible)
|
| 66 |
+
.replace(/\\hdots/g, '\\ldots')
|
| 67 |
+
// Add more math command fixes here as needed
|
| 68 |
+
.replace(/\\vdots/g, '\\vdots'); // This one should be fine, but kept for consistency
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Convert LaTeX matrix commands to KaTeX-compatible environments
|
| 73 |
+
* @param {string} content - Markdown content
|
| 74 |
+
* @returns {string} - Content with fixed matrix commands
|
| 75 |
+
*/
|
| 76 |
+
function fixMatrixCommands(content) {
|
| 77 |
+
console.log(' 🔢 Converting matrix commands to KaTeX format...');
|
| 78 |
+
|
| 79 |
+
let fixedCount = 0;
|
| 80 |
+
|
| 81 |
+
// Convert \pmatrix{...} to \begin{pmatrix}...\end{pmatrix}
|
| 82 |
+
content = content.replace(/\\pmatrix\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, matrixContent) => {
|
| 83 |
+
fixedCount++;
|
| 84 |
+
// Split by \\ for rows, handle nested braces
|
| 85 |
+
const rows = matrixContent.split('\\\\').map(row => row.trim()).filter(row => row);
|
| 86 |
+
return `\\begin{pmatrix}\n${rows.join(' \\\\\n')}\n\\end{pmatrix}`;
|
| 87 |
+
});
|
| 88 |
+
|
| 89 |
+
// Convert \bmatrix{...} to \begin{bmatrix}...\end{bmatrix}
|
| 90 |
+
content = content.replace(/\\bmatrix\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, matrixContent) => {
|
| 91 |
+
fixedCount++;
|
| 92 |
+
const rows = matrixContent.split('\\\\').map(row => row.trim()).filter(row => row);
|
| 93 |
+
return `\\begin{bmatrix}\n${rows.join(' \\\\\n')}\n\\end{bmatrix}`;
|
| 94 |
+
});
|
| 95 |
+
|
| 96 |
+
// Convert \vmatrix{...} to \begin{vmatrix}...\end{vmatrix}
|
| 97 |
+
content = content.replace(/\\vmatrix\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, matrixContent) => {
|
| 98 |
+
fixedCount++;
|
| 99 |
+
const rows = matrixContent.split('\\\\').map(row => row.trim()).filter(row => row);
|
| 100 |
+
return `\\begin{vmatrix}\n${rows.join(' \\\\\n')}\n\\end{vmatrix}`;
|
| 101 |
+
});
|
| 102 |
+
|
| 103 |
+
if (fixedCount > 0) {
|
| 104 |
+
console.log(` ✅ Fixed ${fixedCount} matrix command(s)`);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
return content;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
/**
|
| 111 |
+
* Fix Unicode characters that break MDX/JSX parsing
|
| 112 |
+
* @param {string} content - Markdown content
|
| 113 |
+
* @returns {string} - Cleaned content
|
| 114 |
+
*/
|
| 115 |
+
function fixUnicodeIssues(content) {
|
| 116 |
+
console.log(' 🌐 Fixing Unicode characters for MDX compatibility...');
|
| 117 |
+
|
| 118 |
+
return content
|
| 119 |
+
// Replace Unicode middle dot (·) with \cdot in math expressions
|
| 120 |
+
.replace(/\$([^$]*?)·([^$]*?)\$/g, (match, before, after) => {
|
| 121 |
+
return `$${before}\\cdot${after}$`;
|
| 122 |
+
})
|
| 123 |
+
// Replace Unicode middle dot in display math
|
| 124 |
+
.replace(/\$\$([^$]*?)·([^$]*?)\$\$/g, (match, before, after) => {
|
| 125 |
+
return `$$${before}\\cdot${after}$$`;
|
| 126 |
+
})
|
| 127 |
+
// Replace other problematic Unicode characters
|
| 128 |
+
.replace(/[""]/g, '"') // Smart quotes to regular quotes
|
| 129 |
+
.replace(/['']/g, "'") // Smart apostrophes to regular apostrophes
|
| 130 |
+
.replace(/…/g, '...') // Ellipsis to three dots
|
| 131 |
+
.replace(/–/g, '-') // En dash to hyphen
|
| 132 |
+
.replace(/—/g, '--'); // Em dash to double hyphen
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
/**
|
| 136 |
+
* Fix multiline math expressions for MDX compatibility
|
| 137 |
+
* @param {string} content - Markdown content
|
| 138 |
+
* @returns {string} - Cleaned content
|
| 139 |
+
*/
|
| 140 |
+
function fixMultilineMath(content) {
|
| 141 |
+
console.log(' 📏 Fixing multiline math expressions for MDX...');
|
| 142 |
+
|
| 143 |
+
return content
|
| 144 |
+
// Convert multiline inline math to display math blocks (more precise regex)
|
| 145 |
+
// Only match if the content is a self-contained math expression within a single line
|
| 146 |
+
.replace(/\$([^$\n]*\\\\[^$\n]*)\$/g, (match, mathContent) => {
|
| 147 |
+
// Only convert if it contains actual math operators and line breaks
|
| 148 |
+
if (mathContent.includes('\\\\') && /[=+\-*/^_{}]/.test(mathContent)) {
|
| 149 |
+
// Remove leading/trailing whitespace and normalize newlines
|
| 150 |
+
const cleanedMath = mathContent
|
| 151 |
+
.replace(/^\s+|\s+$/g, '')
|
| 152 |
+
.replace(/\s*\\\\\s*/g, '\\\\\n ');
|
| 153 |
+
return `$$\n${cleanedMath}\n$$`;
|
| 154 |
+
}
|
| 155 |
+
return match; // Keep original if it doesn't look like multiline math
|
| 156 |
+
})
|
| 157 |
+
// Ensure display math blocks are properly separated
|
| 158 |
+
.replace(/\$\$\s*\n\s*([^$]+?)\s*\n\s*\$\$/g, (match, mathContent) => {
|
| 159 |
+
return `\n$$\n${mathContent.trim()}\n$$\n`;
|
| 160 |
+
});
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
/**
|
| 164 |
+
* Inject code snippets into empty code blocks
|
| 165 |
+
* @param {string} content - Markdown content
|
| 166 |
+
* @param {string} inputDir - Directory containing the LaTeX source and snippets
|
| 167 |
+
* @returns {string} - Content with injected code snippets
|
| 168 |
+
*/
|
| 169 |
+
function injectCodeSnippets(content, inputDir = null) {
|
| 170 |
+
console.log(' 💻 Injecting code snippets...');
|
| 171 |
+
|
| 172 |
+
if (!inputDir) {
|
| 173 |
+
console.log(' ⚠️ No input directory provided, skipping code injection');
|
| 174 |
+
return content;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
const snippetsDir = join(inputDir, 'snippets');
|
| 178 |
+
|
| 179 |
+
if (!existsSync(snippetsDir)) {
|
| 180 |
+
console.log(' ⚠️ Snippets directory not found, skipping code injection');
|
| 181 |
+
return content;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
// Get all available snippet files
|
| 185 |
+
let availableSnippets = [];
|
| 186 |
+
try {
|
| 187 |
+
availableSnippets = readdirSync(snippetsDir);
|
| 188 |
+
console.log(` 📁 Found ${availableSnippets.length} snippet file(s): ${availableSnippets.join(', ')}`);
|
| 189 |
+
} catch (error) {
|
| 190 |
+
console.log(` ❌ Error reading snippets directory: ${error.message}`);
|
| 191 |
+
return content;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
// Find all empty code blocks
|
| 195 |
+
const emptyCodeBlockPattern = /```\s*(\w+)\s*\n\s*```/g;
|
| 196 |
+
|
| 197 |
+
let processedContent = content;
|
| 198 |
+
let injectionCount = 0;
|
| 199 |
+
|
| 200 |
+
processedContent = processedContent.replace(emptyCodeBlockPattern, (match, language) => {
|
| 201 |
+
// Map language names to file extensions
|
| 202 |
+
const extensionMap = {
|
| 203 |
+
'python': 'py',
|
| 204 |
+
'javascript': 'js',
|
| 205 |
+
'typescript': 'ts',
|
| 206 |
+
'bash': 'sh',
|
| 207 |
+
'shell': 'sh'
|
| 208 |
+
};
|
| 209 |
+
|
| 210 |
+
const fileExtension = extensionMap[language] || language;
|
| 211 |
+
|
| 212 |
+
// Try to find a matching snippet file for this language
|
| 213 |
+
const matchingFiles = availableSnippets.filter(file =>
|
| 214 |
+
file.endsWith(`.${fileExtension}`)
|
| 215 |
+
);
|
| 216 |
+
|
| 217 |
+
if (matchingFiles.length === 0) {
|
| 218 |
+
console.log(` ⚠️ No ${language} snippet found (looking for .${fileExtension})`);
|
| 219 |
+
return match;
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
// Use the first matching file (could be made smarter with context analysis)
|
| 223 |
+
const selectedFile = matchingFiles[0];
|
| 224 |
+
const snippetPath = join(snippetsDir, selectedFile);
|
| 225 |
+
|
| 226 |
+
try {
|
| 227 |
+
const snippetContent = readFileSync(snippetPath, 'utf8');
|
| 228 |
+
injectionCount++;
|
| 229 |
+
console.log(` ✅ Injected: ${selectedFile}`);
|
| 230 |
+
return `\`\`\`${language}\n${snippetContent.trim()}\n\`\`\``;
|
| 231 |
+
} catch (error) {
|
| 232 |
+
console.log(` ❌ Error reading ${selectedFile}: ${error.message}`);
|
| 233 |
+
return match;
|
| 234 |
+
}
|
| 235 |
+
});
|
| 236 |
+
|
| 237 |
+
if (injectionCount > 0) {
|
| 238 |
+
console.log(` 📊 Injected ${injectionCount} code snippet(s)`);
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
return processedContent;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
/**
|
| 245 |
+
* Fix all attributes that still contain colons (href, data-reference, id)
|
| 246 |
+
* @param {string} content - Markdown content
|
| 247 |
+
* @returns {string} - Cleaned content
|
| 248 |
+
*/
|
| 249 |
+
function fixAllAttributes(content) {
|
| 250 |
+
console.log(' 🔗 Fixing all attributes with colons...');
|
| 251 |
+
|
| 252 |
+
let fixedCount = 0;
|
| 253 |
+
|
| 254 |
+
// Fix href attributes containing colons
|
| 255 |
+
content = content.replace(/href="([^"]*):([^"]*)"/g, (match, before, after) => {
|
| 256 |
+
fixedCount++;
|
| 257 |
+
return `href="${before}-${after}"`;
|
| 258 |
+
});
|
| 259 |
+
|
| 260 |
+
// Fix data-reference attributes containing colons
|
| 261 |
+
content = content.replace(/data-reference="([^"]*):([^"]*)"/g, (match, before, after) => {
|
| 262 |
+
fixedCount++;
|
| 263 |
+
return `data-reference="${before}-${after}"`;
|
| 264 |
+
});
|
| 265 |
+
|
| 266 |
+
// Fix id attributes containing colons (like in Figure components)
|
| 267 |
+
content = content.replace(/id="([^"]*):([^"]*)"/g, (match, before, after) => {
|
| 268 |
+
fixedCount++;
|
| 269 |
+
return `id="${before}-${after}"`;
|
| 270 |
+
});
|
| 271 |
+
|
| 272 |
+
if (fixedCount > 0) {
|
| 273 |
+
console.log(` ✅ Fixed ${fixedCount} attribute(s) with colons`);
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
return content;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
/**
|
| 280 |
+
* Fix link text content that still contains colons
|
| 281 |
+
* @param {string} content - Markdown content
|
| 282 |
+
* @returns {string} - Cleaned content
|
| 283 |
+
*/
|
| 284 |
+
function fixLinkTextContent(content) {
|
| 285 |
+
console.log(' 📝 Fixing link text content with colons...');
|
| 286 |
+
|
| 287 |
+
let fixedCount = 0;
|
| 288 |
+
|
| 289 |
+
// Fix text content within links that contain references with colons
|
| 290 |
+
// Pattern: <a ...>[text:content]</a>
|
| 291 |
+
const cleanedContent = content.replace(/<a([^>]*)>\[([^:]*):([^\]]*)\]<\/a>/g, (match, attributes, before, after) => {
|
| 292 |
+
fixedCount++;
|
| 293 |
+
return `<a${attributes}>[${before}-${after}]</a>`;
|
| 294 |
+
});
|
| 295 |
+
|
| 296 |
+
if (fixedCount > 0) {
|
| 297 |
+
console.log(` ✅ Fixed ${fixedCount} link text(s) with colons`);
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
return cleanedContent;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
/**
|
| 304 |
+
* Convert align anchor markers to proper HTML spans outside math blocks
|
| 305 |
+
* @param {string} content - Markdown content
|
| 306 |
+
* @returns {string} - Content with converted anchor spans
|
| 307 |
+
*/
|
| 308 |
+
function convertAlignAnchors(content) {
|
| 309 |
+
console.log(' 🏷️ Converting align anchor markers to HTML spans...');
|
| 310 |
+
|
| 311 |
+
let convertedCount = 0;
|
| 312 |
+
|
| 313 |
+
// Find and replace align anchor markers with proper spans outside math blocks
|
| 314 |
+
content = content.replace(/``` math\n%%ALIGN_ANCHOR_ID\{([^}]+)\}%%\n([\s\S]*?)\n```/g, (match, anchorId, mathContent) => {
|
| 315 |
+
convertedCount++;
|
| 316 |
+
return `<span id="${anchorId}" style="position: absolute;"></span>\n\n\`\`\` math\n${mathContent}\n\`\`\``;
|
| 317 |
+
});
|
| 318 |
+
|
| 319 |
+
if (convertedCount > 0) {
|
| 320 |
+
console.log(` ✅ Converted ${convertedCount} align anchor marker(s) to spans`);
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
return content;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
/**
|
| 327 |
+
* Main post-processing function that applies all cleanup steps
|
| 328 |
+
* @param {string} content - Raw Markdown content from Pandoc
|
| 329 |
+
* @param {string} inputDir - Optional: Directory containing LaTeX source for code injection
|
| 330 |
+
* @returns {string} - Cleaned Markdown content
|
| 331 |
+
*/
|
| 332 |
+
export function postProcessMarkdown(content, inputDir = null) {
|
| 333 |
+
console.log('🔧 Post-processing for KaTeX compatibility...');
|
| 334 |
+
|
| 335 |
+
let processedContent = content;
|
| 336 |
+
|
| 337 |
+
// Apply each cleanup step sequentially
|
| 338 |
+
processedContent = removeTexGroupingCommands(processedContent);
|
| 339 |
+
processedContent = simplifyLatexDelimiters(processedContent);
|
| 340 |
+
processedContent = removeOrphanedLabels(processedContent);
|
| 341 |
+
processedContent = convertAlignAnchors(processedContent);
|
| 342 |
+
processedContent = fixMathCommands(processedContent);
|
| 343 |
+
processedContent = fixMatrixCommands(processedContent);
|
| 344 |
+
processedContent = fixUnicodeIssues(processedContent);
|
| 345 |
+
processedContent = fixMultilineMath(processedContent);
|
| 346 |
+
processedContent = fixAllAttributes(processedContent);
|
| 347 |
+
processedContent = fixLinkTextContent(processedContent);
|
| 348 |
+
|
| 349 |
+
// Inject code snippets if input directory is provided
|
| 350 |
+
if (inputDir) {
|
| 351 |
+
processedContent = injectCodeSnippets(processedContent, inputDir);
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
return processedContent;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
/**
|
| 358 |
+
* CLI interface for standalone usage
|
| 359 |
+
*/
|
| 360 |
+
function parseArgs() {
|
| 361 |
+
const args = process.argv.slice(2);
|
| 362 |
+
const config = {
|
| 363 |
+
input: join(__dirname, 'output', 'main.md'),
|
| 364 |
+
output: null, // Will default to input if not specified
|
| 365 |
+
verbose: false,
|
| 366 |
+
};
|
| 367 |
+
|
| 368 |
+
for (const arg of args) {
|
| 369 |
+
if (arg.startsWith('--input=')) {
|
| 370 |
+
config.input = arg.substring('--input='.length);
|
| 371 |
+
} else if (arg.startsWith('--output=')) {
|
| 372 |
+
config.output = arg.substring('--output='.length);
|
| 373 |
+
} else if (arg === '--verbose') {
|
| 374 |
+
config.verbose = true;
|
| 375 |
+
} else if (arg === '--help' || arg === '-h') {
|
| 376 |
+
console.log(`
|
| 377 |
+
🔧 Markdown Post-Processor
|
| 378 |
+
|
| 379 |
+
Usage:
|
| 380 |
+
node post-processor.mjs [options]
|
| 381 |
+
|
| 382 |
+
Options:
|
| 383 |
+
--input=PATH Input Markdown file (default: output/main.md)
|
| 384 |
+
--output=PATH Output file (default: overwrites input)
|
| 385 |
+
--verbose Verbose output
|
| 386 |
+
--help, -h Show this help
|
| 387 |
+
|
| 388 |
+
Examples:
|
| 389 |
+
# Process main.md in-place
|
| 390 |
+
node post-processor.mjs
|
| 391 |
+
|
| 392 |
+
# Process with custom paths
|
| 393 |
+
node post-processor.mjs --input=raw.md --output=clean.md
|
| 394 |
+
`);
|
| 395 |
+
process.exit(0);
|
| 396 |
+
}
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
// Default output to input if not specified
|
| 400 |
+
if (!config.output) {
|
| 401 |
+
config.output = config.input;
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
return config;
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
function main() {
|
| 408 |
+
const config = parseArgs();
|
| 409 |
+
|
| 410 |
+
console.log('🔧 Markdown Post-Processor');
|
| 411 |
+
console.log(`📁 Input: ${config.input}`);
|
| 412 |
+
console.log(`📁 Output: ${config.output}`);
|
| 413 |
+
|
| 414 |
+
try {
|
| 415 |
+
const content = readFileSync(config.input, 'utf8');
|
| 416 |
+
const processedContent = postProcessMarkdown(content);
|
| 417 |
+
|
| 418 |
+
writeFileSync(config.output, processedContent);
|
| 419 |
+
|
| 420 |
+
console.log(`✅ Post-processing completed: ${config.output}`);
|
| 421 |
+
|
| 422 |
+
// Show stats if verbose
|
| 423 |
+
if (config.verbose) {
|
| 424 |
+
const originalLines = content.split('\n').length;
|
| 425 |
+
const processedLines = processedContent.split('\n').length;
|
| 426 |
+
console.log(`📊 Lines: ${originalLines} → ${processedLines}`);
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
} catch (error) {
|
| 430 |
+
console.error('❌ Post-processing failed:');
|
| 431 |
+
console.error(error.message);
|
| 432 |
+
process.exit(1);
|
| 433 |
+
}
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
// Run CLI if called directly
|
| 437 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 438 |
+
main();
|
| 439 |
+
}
|
app/scripts/latex-importer/reference-preprocessor.mjs
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
/**
|
| 4 |
+
* LaTeX Reference Preprocessor
|
| 5 |
+
*
|
| 6 |
+
* This module cleans up LaTeX references BEFORE Pandoc conversion to ensure
|
| 7 |
+
* consistent, MDX-compatible identifiers throughout the document.
|
| 8 |
+
*
|
| 9 |
+
* What it does:
|
| 10 |
+
* - Removes prefixes from labels: \label{sec:intro} → \label{sec-intro}
|
| 11 |
+
* - Updates corresponding refs: \ref{sec:intro} → \ref{sec-intro}
|
| 12 |
+
* - Handles all reference types: sec:, fig:, eq:, table:, etc.
|
| 13 |
+
* - Maintains consistency between labels and references
|
| 14 |
+
*/
|
| 15 |
+
|
| 16 |
+
/**
|
| 17 |
+
* Extract all references from LaTeX content
|
| 18 |
+
* @param {string} content - LaTeX content
|
| 19 |
+
* @returns {Object} - Object with labels and refs arrays
|
| 20 |
+
*/
|
| 21 |
+
function extractReferences(content) {
|
| 22 |
+
const references = {
|
| 23 |
+
labels: new Set(),
|
| 24 |
+
refs: new Set(),
|
| 25 |
+
cites: new Set()
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
// Find all \label{...} commands
|
| 29 |
+
const labelMatches = content.matchAll(/\\label\{([^}]+)\}/g);
|
| 30 |
+
for (const match of labelMatches) {
|
| 31 |
+
references.labels.add(match[1]);
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
// Find all \ref{...} commands
|
| 35 |
+
const refMatches = content.matchAll(/\\ref\{([^}]+)\}/g);
|
| 36 |
+
for (const match of refMatches) {
|
| 37 |
+
references.refs.add(match[1]);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// Find all \cite{...} commands (already handled in existing code but included for completeness)
|
| 41 |
+
const citeMatches = content.matchAll(/\\cite[tp]?\{([^}]+)\}/g);
|
| 42 |
+
for (const match of citeMatches) {
|
| 43 |
+
// Handle multiple citations: \cite{ref1,ref2,ref3}
|
| 44 |
+
const citations = match[1].split(',').map(cite => cite.trim());
|
| 45 |
+
citations.forEach(cite => references.cites.add(cite));
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
return references;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
/**
|
| 52 |
+
* Create clean identifier mapping
|
| 53 |
+
* @param {Object} references - References object from extractReferences
|
| 54 |
+
* @returns {Map} - Mapping from original to clean identifiers
|
| 55 |
+
*/
|
| 56 |
+
function createCleanMapping(references) {
|
| 57 |
+
const mapping = new Map();
|
| 58 |
+
|
| 59 |
+
// Create mapping for all unique identifiers
|
| 60 |
+
const allIdentifiers = new Set([
|
| 61 |
+
...references.labels,
|
| 62 |
+
...references.refs
|
| 63 |
+
]);
|
| 64 |
+
|
| 65 |
+
for (const id of allIdentifiers) {
|
| 66 |
+
// Remove common prefixes and replace colons with dashes
|
| 67 |
+
let cleanId = id
|
| 68 |
+
.replace(/^(sec|section|ch|chapter|fig|figure|eq|equation|tab|table|lst|listing|app|appendix):/gi, '')
|
| 69 |
+
.replace(/:/g, '-')
|
| 70 |
+
.replace(/[^a-zA-Z0-9_-]/g, '-') // Replace any other problematic characters
|
| 71 |
+
.replace(/-+/g, '-') // Collapse multiple dashes
|
| 72 |
+
.replace(/^-|-$/g, ''); // Remove leading/trailing dashes
|
| 73 |
+
|
| 74 |
+
// Ensure we don't have empty identifiers
|
| 75 |
+
if (!cleanId) {
|
| 76 |
+
cleanId = id.replace(/:/g, '-');
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
mapping.set(id, cleanId);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
return mapping;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
/**
|
| 86 |
+
* Convert labels to HTML anchor spans for better MDX compatibility
|
| 87 |
+
* @param {string} content - LaTeX content
|
| 88 |
+
* @param {Map} mapping - Identifier mapping (original -> clean)
|
| 89 |
+
* @returns {Object} - Result with content and count of conversions
|
| 90 |
+
*/
|
| 91 |
+
function convertLabelsToAnchors(content, mapping) {
|
| 92 |
+
let processedContent = content;
|
| 93 |
+
let anchorsCreated = 0;
|
| 94 |
+
|
| 95 |
+
// Replace \label{...} with HTML anchor spans, but SKIP labels inside math environments
|
| 96 |
+
for (const [original, clean] of mapping) {
|
| 97 |
+
// Skip equation labels (they will be handled by the Lua filter)
|
| 98 |
+
if (original.startsWith('eq:')) {
|
| 99 |
+
continue;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
const labelRegex = new RegExp(`\\\\label\\{${escapeRegex(original)}\\}`, 'g');
|
| 103 |
+
const labelMatches = processedContent.match(labelRegex);
|
| 104 |
+
|
| 105 |
+
if (labelMatches) {
|
| 106 |
+
// Replace \label{original} with HTML span anchor (invisible but accessible)
|
| 107 |
+
processedContent = processedContent.replace(labelRegex, `\n\n<span id="${clean}" style="position: absolute;"></span>\n\n`);
|
| 108 |
+
anchorsCreated += labelMatches.length;
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
return { content: processedContent, anchorsCreated };
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
/**
|
| 116 |
+
* Convert \highlight{...} commands to HTML spans with CSS class
|
| 117 |
+
* @param {string} content - LaTeX content
|
| 118 |
+
* @returns {Object} - Result with content and count of conversions
|
| 119 |
+
*/
|
| 120 |
+
function convertHighlightCommands(content) {
|
| 121 |
+
let processedContent = content;
|
| 122 |
+
let highlightsConverted = 0;
|
| 123 |
+
|
| 124 |
+
// Replace \highlight{...} with <span class="highlight">...</span>
|
| 125 |
+
processedContent = processedContent.replace(/\\highlight\{([^}]+)\}/g, (match, text) => {
|
| 126 |
+
highlightsConverted++;
|
| 127 |
+
return `<span class="highlight">${text}</span>`;
|
| 128 |
+
});
|
| 129 |
+
|
| 130 |
+
return { content: processedContent, highlightsConverted };
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
/**
|
| 134 |
+
* Apply mapping to LaTeX content
|
| 135 |
+
* @param {string} content - Original LaTeX content
|
| 136 |
+
* @param {Map} mapping - Identifier mapping
|
| 137 |
+
* @returns {string} - Cleaned LaTeX content
|
| 138 |
+
*/
|
| 139 |
+
function applyMapping(content, mapping) {
|
| 140 |
+
let cleanedContent = content;
|
| 141 |
+
let changesCount = 0;
|
| 142 |
+
|
| 143 |
+
// First, convert labels to anchor spans
|
| 144 |
+
const anchorResult = convertLabelsToAnchors(cleanedContent, mapping);
|
| 145 |
+
cleanedContent = anchorResult.content;
|
| 146 |
+
const anchorsCreated = anchorResult.anchorsCreated;
|
| 147 |
+
|
| 148 |
+
// Convert \highlight{} commands to spans
|
| 149 |
+
const highlightResult = convertHighlightCommands(cleanedContent);
|
| 150 |
+
cleanedContent = highlightResult.content;
|
| 151 |
+
const highlightsConverted = highlightResult.highlightsConverted;
|
| 152 |
+
|
| 153 |
+
// Then apply mapping to remaining references and equation labels
|
| 154 |
+
for (const [original, clean] of mapping) {
|
| 155 |
+
if (original !== clean) {
|
| 156 |
+
// Replace \ref{original} with \ref{clean}
|
| 157 |
+
const refRegex = new RegExp(`\\\\ref\\{${escapeRegex(original)}\\}`, 'g');
|
| 158 |
+
const refMatches = cleanedContent.match(refRegex);
|
| 159 |
+
if (refMatches) {
|
| 160 |
+
cleanedContent = cleanedContent.replace(refRegex, `\\ref{${clean}}`);
|
| 161 |
+
changesCount += refMatches.length;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
// For equation labels, still clean the labels themselves (for the Lua filter)
|
| 165 |
+
if (original.startsWith('eq:')) {
|
| 166 |
+
const labelRegex = new RegExp(`\\\\label\\{${escapeRegex(original)}\\}`, 'g');
|
| 167 |
+
const labelMatches = cleanedContent.match(labelRegex);
|
| 168 |
+
if (labelMatches) {
|
| 169 |
+
cleanedContent = cleanedContent.replace(labelRegex, `\\label{${clean}}`);
|
| 170 |
+
changesCount += labelMatches.length;
|
| 171 |
+
}
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
return {
|
| 177 |
+
content: cleanedContent,
|
| 178 |
+
changesCount: changesCount + anchorsCreated,
|
| 179 |
+
highlightsConverted: highlightsConverted
|
| 180 |
+
};
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
/**
|
| 184 |
+
* Escape special regex characters
|
| 185 |
+
* @param {string} string - String to escape
|
| 186 |
+
* @returns {string} - Escaped string
|
| 187 |
+
*/
|
| 188 |
+
function escapeRegex(string) {
|
| 189 |
+
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
/**
|
| 193 |
+
* Main preprocessing function
|
| 194 |
+
* @param {string} latexContent - Original LaTeX content
|
| 195 |
+
* @returns {Object} - Result with cleaned content and statistics
|
| 196 |
+
*/
|
| 197 |
+
export function preprocessLatexReferences(latexContent) {
|
| 198 |
+
console.log('🔧 Preprocessing LaTeX references for MDX compatibility...');
|
| 199 |
+
|
| 200 |
+
// 1. Extract all references
|
| 201 |
+
const references = extractReferences(latexContent);
|
| 202 |
+
|
| 203 |
+
console.log(` 📊 Found: ${references.labels.size} labels, ${references.refs.size} refs`);
|
| 204 |
+
|
| 205 |
+
// 2. Create clean mapping
|
| 206 |
+
const mapping = createCleanMapping(references);
|
| 207 |
+
|
| 208 |
+
// 3. Apply mapping
|
| 209 |
+
const result = applyMapping(latexContent, mapping);
|
| 210 |
+
|
| 211 |
+
if (result.changesCount > 0) {
|
| 212 |
+
console.log(` ✅ Processed ${result.changesCount} reference(s) and created anchor spans`);
|
| 213 |
+
|
| 214 |
+
// Show some examples of changes
|
| 215 |
+
let exampleCount = 0;
|
| 216 |
+
for (const [original, clean] of mapping) {
|
| 217 |
+
if (original !== clean && exampleCount < 3) {
|
| 218 |
+
console.log(` ${original} → ${clean} (span + refs)`);
|
| 219 |
+
exampleCount++;
|
| 220 |
+
}
|
| 221 |
+
}
|
| 222 |
+
if (mapping.size > 3) {
|
| 223 |
+
console.log(` ... and ${mapping.size - 3} more anchor spans created`);
|
| 224 |
+
}
|
| 225 |
+
} else {
|
| 226 |
+
console.log(' ℹ️ No reference cleanup needed');
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
if (result.highlightsConverted > 0) {
|
| 230 |
+
console.log(` ✨ Converted ${result.highlightsConverted} \\highlight{} command(s) to <span class="highlight">`);
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
return {
|
| 234 |
+
content: result.content,
|
| 235 |
+
changesCount: result.changesCount,
|
| 236 |
+
mapping: mapping,
|
| 237 |
+
references: references
|
| 238 |
+
};
|
| 239 |
+
}
|
app/scripts/notion-importer/.cursorignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
app/scripts/notion-importer/README.md
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Notion Importer
|
| 2 |
+
|
| 3 |
+
Complete Notion to MDX (Markdown + JSX) importer optimized for Astro with advanced media handling, interactive components, and seamless integration.
|
| 4 |
+
|
| 5 |
+
## 🚀 Quick Start
|
| 6 |
+
|
| 7 |
+
### Method 1: Using NOTION_PAGE_ID (Recommended)
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
# Install dependencies
|
| 11 |
+
npm install
|
| 12 |
+
|
| 13 |
+
# Setup environment variables
|
| 14 |
+
cp env.example .env
|
| 15 |
+
# Edit .env with your Notion token and page ID
|
| 16 |
+
|
| 17 |
+
# Complete Notion → MDX conversion (fetches title/slug automatically)
|
| 18 |
+
NOTION_TOKEN=secret_xxx NOTION_PAGE_ID=abc123 node index.mjs
|
| 19 |
+
|
| 20 |
+
# Or use .env file
|
| 21 |
+
node index.mjs
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### Method 2: Using pages.json (Legacy)
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
# Install dependencies
|
| 28 |
+
npm install
|
| 29 |
+
|
| 30 |
+
# Setup environment variables
|
| 31 |
+
cp env.example .env
|
| 32 |
+
# Edit .env with your Notion token
|
| 33 |
+
|
| 34 |
+
# Configure pages in input/pages.json
|
| 35 |
+
# {
|
| 36 |
+
# "pages": [
|
| 37 |
+
# {
|
| 38 |
+
# "id": "your-page-id",
|
| 39 |
+
# "title": "Title",
|
| 40 |
+
# "slug": "slug"
|
| 41 |
+
# }
|
| 42 |
+
# ]
|
| 43 |
+
# }
|
| 44 |
+
|
| 45 |
+
# Complete Notion → MDX conversion
|
| 46 |
+
node index.mjs
|
| 47 |
+
|
| 48 |
+
# For step-by-step debugging
|
| 49 |
+
node notion-converter.mjs # Notion → Markdown
|
| 50 |
+
node mdx-converter.mjs # Markdown → MDX
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## 📁 Structure
|
| 54 |
+
|
| 55 |
+
```
|
| 56 |
+
notion-importer/
|
| 57 |
+
├── index.mjs # Complete Notion → MDX pipeline
|
| 58 |
+
├── notion-converter.mjs # Notion → Markdown with notion-to-md v4
|
| 59 |
+
├── mdx-converter.mjs # Markdown → MDX with Astro components
|
| 60 |
+
├── post-processor.mjs # Markdown post-processing
|
| 61 |
+
├── package.json # Dependencies and scripts
|
| 62 |
+
├── env.example # Environment variables template
|
| 63 |
+
├── static/ # Static files injected at build time
|
| 64 |
+
│ ├── frontmatter.mdx # Static frontmatter (overrides all others)
|
| 65 |
+
│ └── bibliography.bib # Static bibliography
|
| 66 |
+
├── input/ # Configuration
|
| 67 |
+
│ └── pages.json # Notion pages to convert
|
| 68 |
+
└── output/ # Results
|
| 69 |
+
├── *.md # Intermediate Markdown
|
| 70 |
+
├── *.mdx # Final MDX for Astro
|
| 71 |
+
└── media/ # Downloaded media files
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## ✨ Key Features
|
| 75 |
+
|
| 76 |
+
### 🎯 **Advanced Media Handling**
|
| 77 |
+
- **Local download**: Automatic download of all Notion media (images, files, PDFs)
|
| 78 |
+
- **Path transformation**: Smart path conversion for web accessibility
|
| 79 |
+
- **Image components**: Automatic conversion to Astro `Image` components with zoom/download
|
| 80 |
+
- **Media organization**: Structured media storage by page ID
|
| 81 |
+
|
| 82 |
+
### 🧮 **Interactive Components**
|
| 83 |
+
- **Callouts → Notes**: Notion callouts converted to Astro `Note` components
|
| 84 |
+
- **Enhanced tables**: Tables wrapped in styled containers
|
| 85 |
+
- **Code blocks**: Enhanced with copy functionality
|
| 86 |
+
- **Automatic imports**: Smart component and image import generation
|
| 87 |
+
|
| 88 |
+
### 🎨 **Smart Formatting**
|
| 89 |
+
- **Link fixing**: Notion internal links converted to relative links
|
| 90 |
+
- **Artifact cleanup**: Removal of Notion-specific formatting artifacts
|
| 91 |
+
- **Static frontmatter**: Priority injection of custom frontmatter from `static/frontmatter.mdx`
|
| 92 |
+
- **Static bibliography**: Automatic copying of `static/bibliography.bib`
|
| 93 |
+
- **Astro compatibility**: Full compatibility with Astro MDX processing
|
| 94 |
+
|
| 95 |
+
### 🔧 **Robust Pipeline**
|
| 96 |
+
- **Notion preprocessing**: Advanced page configuration and media strategy
|
| 97 |
+
- **Post-processing**: Markdown cleanup and optimization
|
| 98 |
+
- **MDX conversion**: Final transformation with Astro components
|
| 99 |
+
- **Auto-copy**: Automatic copying to Astro content directory
|
| 100 |
+
|
| 101 |
+
## 📄 Static Files Configuration
|
| 102 |
+
|
| 103 |
+
The importer supports static files for consistent metadata and bibliography:
|
| 104 |
+
|
| 105 |
+
### Frontmatter (`static/frontmatter.mdx`)
|
| 106 |
+
Create this file to override frontmatter across all conversions:
|
| 107 |
+
|
| 108 |
+
```yaml
|
| 109 |
+
---
|
| 110 |
+
title: "My Article Title"
|
| 111 |
+
subtitle: "Optional subtitle"
|
| 112 |
+
description: "Article description for SEO"
|
| 113 |
+
authors:
|
| 114 |
+
- name: "Jane Doe"
|
| 115 |
+
url: "https://example.com"
|
| 116 |
+
affiliations:
|
| 117 |
+
- "Hugging Face"
|
| 118 |
+
tags:
|
| 119 |
+
- AI
|
| 120 |
+
- Research
|
| 121 |
+
doi: "10.1000/182"
|
| 122 |
+
tableOfContentsAutoCollapse: true
|
| 123 |
+
---
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
This static frontmatter takes **highest priority** over any Notion metadata or existing frontmatter.
|
| 127 |
+
|
| 128 |
+
### Bibliography (`static/bibliography.bib`)
|
| 129 |
+
Add your BibTeX entries to be copied to `src/content/bibliography.bib`:
|
| 130 |
+
|
| 131 |
+
```bibtex
|
| 132 |
+
@article{example2024,
|
| 133 |
+
title={Example Article},
|
| 134 |
+
author={Doe, Jane and Smith, John},
|
| 135 |
+
journal={Example Journal},
|
| 136 |
+
year={2024}
|
| 137 |
+
}
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
## 📊 Example Workflow
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
# 1. Configure your Notion pages
|
| 144 |
+
# Edit input/pages.json with your page IDs
|
| 145 |
+
|
| 146 |
+
# 2. Complete automatic conversion
|
| 147 |
+
NOTION_TOKEN=your_token node index.mjs --clean
|
| 148 |
+
|
| 149 |
+
# 3. Generated results
|
| 150 |
+
ls output/
|
| 151 |
+
# → getting-started.md (Intermediate Markdown)
|
| 152 |
+
# → getting-started.mdx (Final MDX for Astro)
|
| 153 |
+
# → media/ (downloaded images and files)
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
### 📋 Conversion Result
|
| 157 |
+
|
| 158 |
+
The pipeline generates MDX files optimized for Astro with:
|
| 159 |
+
|
| 160 |
+
```mdx
|
| 161 |
+
---
|
| 162 |
+
title: "Getting Started with Notion"
|
| 163 |
+
published: "2024-01-15"
|
| 164 |
+
tableOfContentsAutoCollapse: true
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
import Image from '../components/Image.astro';
|
| 168 |
+
import Note from '../components/Note.astro';
|
| 169 |
+
import gettingStartedImage from './media/getting-started/image1.png';
|
| 170 |
+
|
| 171 |
+
## Introduction
|
| 172 |
+
|
| 173 |
+
Here is some content with a callout:
|
| 174 |
+
|
| 175 |
+
<Note type="info" title="Important">
|
| 176 |
+
This is a converted Notion callout.
|
| 177 |
+
</Note>
|
| 178 |
+
|
| 179 |
+
And an image:
|
| 180 |
+
|
| 181 |
+
<Figure
|
| 182 |
+
src={gettingStartedImage}
|
| 183 |
+
alt="Getting started screenshot"
|
| 184 |
+
zoomable
|
| 185 |
+
downloadable
|
| 186 |
+
layout="fixed"
|
| 187 |
+
/>
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
## ⚙️ Required Astro Configuration
|
| 191 |
+
|
| 192 |
+
To use the generated MDX files, ensure your Astro project has the required components:
|
| 193 |
+
|
| 194 |
+
```astro
|
| 195 |
+
// src/components/Figure.astro
|
| 196 |
+
---
|
| 197 |
+
export interface Props {
|
| 198 |
+
src: any;
|
| 199 |
+
alt?: string;
|
| 200 |
+
caption?: string;
|
| 201 |
+
zoomable?: boolean;
|
| 202 |
+
downloadable?: boolean;
|
| 203 |
+
layout?: string;
|
| 204 |
+
id?: string;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
const { src, alt, caption, zoomable, downloadable, layout, id } = Astro.props;
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
<figure {id} class="figure">
|
| 211 |
+
<img src={src} alt={alt} />
|
| 212 |
+
{caption && <figcaption>{caption}</figcaption>}
|
| 213 |
+
</figure>
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
## 🛠️ Prerequisites
|
| 217 |
+
|
| 218 |
+
- **Node.js** with ESM support
|
| 219 |
+
- **Notion Integration**: Set up an integration in your Notion workspace
|
| 220 |
+
- **Notion Token**: Copy the "Internal Integration Token"
|
| 221 |
+
- **Shared Pages**: Share the specific Notion page(s) with your integration
|
| 222 |
+
- **Astro** to use the generated MDX
|
| 223 |
+
|
| 224 |
+
## 🎯 Technical Architecture
|
| 225 |
+
|
| 226 |
+
### 4-Stage Pipeline
|
| 227 |
+
|
| 228 |
+
1. **Notion Preprocessing** (`notion-converter.mjs`)
|
| 229 |
+
- Configuration loading from `pages.json`
|
| 230 |
+
- Notion API client initialization
|
| 231 |
+
- Media download strategy configuration
|
| 232 |
+
|
| 233 |
+
2. **Notion-to-Markdown** (notion-to-md v4)
|
| 234 |
+
- Page conversion with `NotionConverter`
|
| 235 |
+
- Media downloading with `downloadMediaTo()`
|
| 236 |
+
- File export with `DefaultExporter`
|
| 237 |
+
|
| 238 |
+
3. **Markdown Post-processing** (`post-processor.mjs`)
|
| 239 |
+
- Notion artifact cleanup
|
| 240 |
+
- Link fixing and optimization
|
| 241 |
+
- Table and code block enhancement
|
| 242 |
+
|
| 243 |
+
4. **MDX Conversion** (`mdx-converter.mjs`)
|
| 244 |
+
- Component transformation (Figure, Note)
|
| 245 |
+
- Automatic import generation
|
| 246 |
+
- Frontmatter enhancement
|
| 247 |
+
- Astro compatibility optimization
|
| 248 |
+
|
| 249 |
+
## 📊 Configuration Options
|
| 250 |
+
|
| 251 |
+
### Pages Configuration (`input/pages.json`)
|
| 252 |
+
|
| 253 |
+
```json
|
| 254 |
+
{
|
| 255 |
+
"pages": [
|
| 256 |
+
{
|
| 257 |
+
"id": "your-notion-page-id",
|
| 258 |
+
"title": "Page Title",
|
| 259 |
+
"slug": "page-slug"
|
| 260 |
+
}
|
| 261 |
+
]
|
| 262 |
+
}
|
| 263 |
+
```
|
| 264 |
+
|
| 265 |
+
### Environment Variables
|
| 266 |
+
|
| 267 |
+
Copy `env.example` to `.env` and configure:
|
| 268 |
+
|
| 269 |
+
```bash
|
| 270 |
+
cp env.example .env
|
| 271 |
+
# Edit .env with your actual Notion token
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
Required variables:
|
| 275 |
+
```bash
|
| 276 |
+
NOTION_TOKEN=secret_your_notion_integration_token_here
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
### Command Line Options
|
| 280 |
+
|
| 281 |
+
```bash
|
| 282 |
+
# Full workflow
|
| 283 |
+
node index.mjs --clean --token=your_token
|
| 284 |
+
|
| 285 |
+
# Notion to Markdown only
|
| 286 |
+
node index.mjs --notion-only
|
| 287 |
+
|
| 288 |
+
# Markdown to MDX only
|
| 289 |
+
node index.mjs --mdx-only
|
| 290 |
+
|
| 291 |
+
# Custom paths
|
| 292 |
+
node index.mjs --input=my-pages.json --output=converted/
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
## 📊 Conversion Statistics
|
| 296 |
+
|
| 297 |
+
For a typical Notion page:
|
| 298 |
+
- **Media files** automatically downloaded and organized
|
| 299 |
+
- **Callouts** converted to interactive Note components
|
| 300 |
+
- **Images** transformed to Figure components with zoom/download
|
| 301 |
+
- **Tables** enhanced with proper styling containers
|
| 302 |
+
- **Code blocks** enhanced with copy functionality
|
| 303 |
+
- **Links** fixed for proper internal navigation
|
| 304 |
+
|
| 305 |
+
## ✅ Project Status
|
| 306 |
+
|
| 307 |
+
### 🎉 **Complete Features**
|
| 308 |
+
- ✅ **Notion → MDX Pipeline**: Full end-to-end functional conversion
|
| 309 |
+
- ✅ **Media Management**: Automatic download and path transformation
|
| 310 |
+
- ✅ **Component Integration**: Seamless Astro component integration
|
| 311 |
+
- ✅ **Smart Formatting**: Intelligent cleanup and optimization
|
| 312 |
+
- ✅ **Robustness**: Error handling and graceful degradation
|
| 313 |
+
- ✅ **Flexibility**: Modular pipeline with step-by-step options
|
| 314 |
+
|
| 315 |
+
### 🚀 **Production Ready**
|
| 316 |
+
The toolkit is now **100% operational** for converting Notion pages to MDX/Astro with all advanced features (media handling, component integration, smart formatting).
|
| 317 |
+
|
| 318 |
+
## 🔗 Integration with notion-to-md v4
|
| 319 |
+
|
| 320 |
+
This toolkit leverages the powerful [notion-to-md v4](https://notionconvert.com/docs/v4/guides/) library with:
|
| 321 |
+
|
| 322 |
+
- **Advanced Media Strategies**: Download, upload, and direct media handling
|
| 323 |
+
- **Custom Renderers**: Block transformers and annotation transformers
|
| 324 |
+
- **Exporter Plugins**: File, buffer, and stdout output options
|
| 325 |
+
- **Database Support**: Full database property and frontmatter transformation
|
| 326 |
+
- **Page References**: Smart internal link handling
|
| 327 |
+
|
| 328 |
+
## 📚 Additional Resources
|
| 329 |
+
|
| 330 |
+
- [notion-to-md v4 Documentation](https://notionconvert.com/docs/v4/guides/)
|
| 331 |
+
- [Notion API Documentation](https://developers.notion.com/)
|
| 332 |
+
- [Astro MDX Documentation](https://docs.astro.build/en/guides/integrations-guide/mdx/)
|
| 333 |
+
- [Media Handling Strategies](https://notionconvert.com/blog/mastering-media-handling-in-notion-to-md-v4-download-upload-and-direct-strategies/)
|
| 334 |
+
- [Frontmatter Transformation](https://notionconvert.com/blog/how-to-convert-notion-properties-to-frontmatter-with-notion-to-md-v4/)
|
app/scripts/notion-importer/env.example
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NOTION_TOKEN=ntn_xxx
|
| 2 |
+
NOTION_PAGE_ID=xxx
|
app/scripts/notion-importer/index.mjs
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { config } from 'dotenv';
|
| 4 |
+
import { join, dirname, basename } from 'path';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
import { copyFileSync, existsSync, mkdirSync, readFileSync, writeFileSync, readdirSync, statSync, unlinkSync } from 'fs';
|
| 7 |
+
import { convertNotionToMarkdown } from './notion-converter.mjs';
|
| 8 |
+
import { convertToMdx } from './mdx-converter.mjs';
|
| 9 |
+
import { Client } from '@notionhq/client';
|
| 10 |
+
|
| 11 |
+
// Load environment variables from .env file (but don't override existing ones)
|
| 12 |
+
config({ override: false });
|
| 13 |
+
|
| 14 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 15 |
+
const __dirname = dirname(__filename);
|
| 16 |
+
|
| 17 |
+
// Default configuration
|
| 18 |
+
const DEFAULT_INPUT = join(__dirname, 'input', 'pages.json');
|
| 19 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output');
|
| 20 |
+
const ASTRO_CONTENT_PATH = join(__dirname, '..', '..', 'src', 'content', 'article.mdx');
|
| 21 |
+
const ASTRO_ASSETS_PATH = join(__dirname, '..', '..', 'src', 'content', 'assets', 'image');
|
| 22 |
+
const ASTRO_BIB_PATH = join(__dirname, '..', '..', 'src', 'content', 'bibliography.bib');
|
| 23 |
+
const STATIC_BIB_PATH = join(__dirname, 'static', 'bibliography.bib');
|
| 24 |
+
|
| 25 |
+
function parseArgs() {
|
| 26 |
+
const args = process.argv.slice(2);
|
| 27 |
+
const config = {
|
| 28 |
+
input: DEFAULT_INPUT,
|
| 29 |
+
output: DEFAULT_OUTPUT,
|
| 30 |
+
clean: false,
|
| 31 |
+
notionOnly: false,
|
| 32 |
+
mdxOnly: false,
|
| 33 |
+
token: process.env.NOTION_TOKEN,
|
| 34 |
+
pageId: process.env.NOTION_PAGE_ID
|
| 35 |
+
};
|
| 36 |
+
|
| 37 |
+
for (const arg of args) {
|
| 38 |
+
if (arg.startsWith('--input=')) {
|
| 39 |
+
config.input = arg.split('=')[1];
|
| 40 |
+
} else if (arg.startsWith('--output=')) {
|
| 41 |
+
config.output = arg.split('=')[1];
|
| 42 |
+
} else if (arg.startsWith('--token=')) {
|
| 43 |
+
config.token = arg.split('=')[1];
|
| 44 |
+
} else if (arg.startsWith('--page-id=')) {
|
| 45 |
+
config.pageId = arg.split('=')[1];
|
| 46 |
+
} else if (arg === '--clean') {
|
| 47 |
+
config.clean = true;
|
| 48 |
+
} else if (arg === '--notion-only') {
|
| 49 |
+
config.notionOnly = true;
|
| 50 |
+
} else if (arg === '--mdx-only') {
|
| 51 |
+
config.mdxOnly = true;
|
| 52 |
+
}
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
return config;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
function showHelp() {
|
| 59 |
+
console.log(`
|
| 60 |
+
🚀 Notion to MDX Toolkit
|
| 61 |
+
|
| 62 |
+
Usage:
|
| 63 |
+
node index.mjs [options]
|
| 64 |
+
|
| 65 |
+
Options:
|
| 66 |
+
--input=PATH Input pages configuration file (default: input/pages.json)
|
| 67 |
+
--output=PATH Output directory (default: output/)
|
| 68 |
+
--token=TOKEN Notion API token (or set NOTION_TOKEN env var)
|
| 69 |
+
--clean Clean output directory before processing
|
| 70 |
+
--notion-only Only convert Notion to Markdown (skip MDX conversion)
|
| 71 |
+
--mdx-only Only convert existing Markdown to MDX
|
| 72 |
+
--help, -h Show this help
|
| 73 |
+
|
| 74 |
+
Environment Variables:
|
| 75 |
+
NOTION_TOKEN Your Notion integration token
|
| 76 |
+
|
| 77 |
+
Examples:
|
| 78 |
+
# Full conversion workflow
|
| 79 |
+
NOTION_TOKEN=your_token node index.mjs --clean
|
| 80 |
+
|
| 81 |
+
# Only convert Notion pages to Markdown
|
| 82 |
+
node index.mjs --notion-only --token=your_token
|
| 83 |
+
|
| 84 |
+
# Only convert existing Markdown to MDX
|
| 85 |
+
node index.mjs --mdx-only
|
| 86 |
+
|
| 87 |
+
# Custom paths
|
| 88 |
+
node index.mjs --input=my-pages.json --output=converted/ --token=your_token
|
| 89 |
+
|
| 90 |
+
Configuration File Format (pages.json):
|
| 91 |
+
{
|
| 92 |
+
"pages": [
|
| 93 |
+
{
|
| 94 |
+
"id": "your-notion-page-id",
|
| 95 |
+
"title": "Page Title",
|
| 96 |
+
"slug": "page-slug"
|
| 97 |
+
}
|
| 98 |
+
]
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
Workflow:
|
| 102 |
+
1. Notion → Markdown (with media download)
|
| 103 |
+
2. Markdown → MDX (with Astro components)
|
| 104 |
+
3. Copy to Astro content directory
|
| 105 |
+
`);
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
function ensureDirectory(dir) {
|
| 109 |
+
if (!existsSync(dir)) {
|
| 110 |
+
mkdirSync(dir, { recursive: true });
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
async function cleanDirectory(dir) {
|
| 115 |
+
if (existsSync(dir)) {
|
| 116 |
+
const { execSync } = await import('child_process');
|
| 117 |
+
execSync(`rm -rf "${dir}"/*`, { stdio: 'inherit' });
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
function readPagesConfig(inputFile) {
|
| 122 |
+
try {
|
| 123 |
+
const content = readFileSync(inputFile, 'utf8');
|
| 124 |
+
return JSON.parse(content);
|
| 125 |
+
} catch (error) {
|
| 126 |
+
console.error(`❌ Error reading pages config: ${error.message}`);
|
| 127 |
+
return { pages: [] };
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
/**
|
| 132 |
+
* Create a temporary pages.json from NOTION_PAGE_ID environment variable
|
| 133 |
+
* Extracts title and generates slug from the Notion page
|
| 134 |
+
*/
|
| 135 |
+
async function createPagesConfigFromEnv(pageId, token, outputPath) {
|
| 136 |
+
try {
|
| 137 |
+
console.log('🔍 Fetching page info from Notion API...');
|
| 138 |
+
const notion = new Client({ auth: token });
|
| 139 |
+
const page = await notion.pages.retrieve({ page_id: pageId });
|
| 140 |
+
|
| 141 |
+
// Extract title
|
| 142 |
+
let title = 'Article';
|
| 143 |
+
if (page.properties.title && page.properties.title.title && page.properties.title.title.length > 0) {
|
| 144 |
+
title = page.properties.title.title[0].plain_text;
|
| 145 |
+
} else if (page.properties.Name && page.properties.Name.title && page.properties.Name.title.length > 0) {
|
| 146 |
+
title = page.properties.Name.title[0].plain_text;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
// Generate slug from title
|
| 150 |
+
const slug = title
|
| 151 |
+
.toLowerCase()
|
| 152 |
+
.replace(/[^\w\s-]/g, '')
|
| 153 |
+
.replace(/\s+/g, '-')
|
| 154 |
+
.replace(/-+/g, '-')
|
| 155 |
+
.trim();
|
| 156 |
+
|
| 157 |
+
console.log(` ✅ Found page: "${title}" (slug: ${slug})`);
|
| 158 |
+
|
| 159 |
+
// Create pages config
|
| 160 |
+
const pagesConfig = {
|
| 161 |
+
pages: [{
|
| 162 |
+
id: pageId,
|
| 163 |
+
title: title,
|
| 164 |
+
slug: slug
|
| 165 |
+
}]
|
| 166 |
+
};
|
| 167 |
+
|
| 168 |
+
// Write to temporary file
|
| 169 |
+
writeFileSync(outputPath, JSON.stringify(pagesConfig, null, 4));
|
| 170 |
+
console.log(` ✅ Created temporary pages config`);
|
| 171 |
+
|
| 172 |
+
return pagesConfig;
|
| 173 |
+
} catch (error) {
|
| 174 |
+
console.error(`❌ Error fetching page from Notion: ${error.message}`);
|
| 175 |
+
throw error;
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
/**
|
| 180 |
+
* Final cleanup function to remove exclude tags and unused imports
|
| 181 |
+
* @param {string} content - MDX content
|
| 182 |
+
* @returns {string} - Cleaned content
|
| 183 |
+
*/
|
| 184 |
+
function cleanupExcludeTagsAndImports(content) {
|
| 185 |
+
let cleanedContent = content;
|
| 186 |
+
let removedCount = 0;
|
| 187 |
+
const removedImageVariables = new Set();
|
| 188 |
+
|
| 189 |
+
// First, extract image variable names from exclude blocks before removing them
|
| 190 |
+
const excludeBlocks = cleanedContent.match(/<exclude>[\s\S]*?<\/exclude>/g) || [];
|
| 191 |
+
excludeBlocks.forEach(match => {
|
| 192 |
+
const imageMatches = match.match(/src=\{([^}]+)\}/g);
|
| 193 |
+
if (imageMatches) {
|
| 194 |
+
imageMatches.forEach(imgMatch => {
|
| 195 |
+
const varName = imgMatch.match(/src=\{([^}]+)\}/)?.[1];
|
| 196 |
+
if (varName) {
|
| 197 |
+
removedImageVariables.add(varName);
|
| 198 |
+
}
|
| 199 |
+
});
|
| 200 |
+
}
|
| 201 |
+
});
|
| 202 |
+
|
| 203 |
+
// Remove <exclude> tags and everything between them (including multiline)
|
| 204 |
+
cleanedContent = cleanedContent.replace(/<exclude>[\s\S]*?<\/exclude>/g, (match) => {
|
| 205 |
+
removedCount++;
|
| 206 |
+
return '';
|
| 207 |
+
});
|
| 208 |
+
|
| 209 |
+
// Remove unused image imports that were only used in exclude blocks
|
| 210 |
+
if (removedImageVariables.size > 0) {
|
| 211 |
+
removedImageVariables.forEach(varName => {
|
| 212 |
+
// Check if the variable is still used elsewhere in the content after removing exclude blocks
|
| 213 |
+
const remainingUsage = cleanedContent.includes(`{${varName}}`) || cleanedContent.includes(`src={${varName}}`);
|
| 214 |
+
|
| 215 |
+
if (!remainingUsage) {
|
| 216 |
+
// Remove import lines for unused image variables
|
| 217 |
+
// Pattern: import VarName from './assets/image/filename';
|
| 218 |
+
const importPattern = new RegExp(`import\\s+${varName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\s+from\\s+['"][^'"]+['"];?\\s*`, 'g');
|
| 219 |
+
cleanedContent = cleanedContent.replace(importPattern, '');
|
| 220 |
+
console.log(` 🗑️ Removed unused import: ${varName}`);
|
| 221 |
+
}
|
| 222 |
+
});
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
if (removedCount > 0) {
|
| 226 |
+
console.log(` 🧹 Final cleanup: removed ${removedCount} exclude block(s) and ${removedImageVariables.size} unused import(s)`);
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
// Ensure there's always a blank line after imports before content starts
|
| 230 |
+
// Find the last import line and ensure there's a blank line before the next non-empty line
|
| 231 |
+
const lines = cleanedContent.split('\n');
|
| 232 |
+
let lastImportIndex = -1;
|
| 233 |
+
|
| 234 |
+
// Find the last import line
|
| 235 |
+
for (let i = 0; i < lines.length; i++) {
|
| 236 |
+
if (lines[i].trim().startsWith('import ') && lines[i].trim().endsWith(';')) {
|
| 237 |
+
lastImportIndex = i;
|
| 238 |
+
}
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
// If we found imports, ensure there's a blank line after the last one
|
| 242 |
+
if (lastImportIndex >= 0) {
|
| 243 |
+
// Find the next non-empty line after the last import
|
| 244 |
+
let nextNonEmptyIndex = lastImportIndex + 1;
|
| 245 |
+
while (nextNonEmptyIndex < lines.length && lines[nextNonEmptyIndex].trim() === '') {
|
| 246 |
+
nextNonEmptyIndex++;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
// If there's no blank line between the last import and next content, add one
|
| 250 |
+
if (nextNonEmptyIndex > lastImportIndex + 1) {
|
| 251 |
+
// There are already blank lines, this is fine
|
| 252 |
+
} else {
|
| 253 |
+
// No blank line, add one
|
| 254 |
+
lines.splice(nextNonEmptyIndex, 0, '');
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
cleanedContent = lines.join('\n');
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
return cleanedContent;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
function copyToAstroContent(outputDir) {
|
| 264 |
+
console.log('📋 Copying MDX files to Astro content directory...');
|
| 265 |
+
|
| 266 |
+
try {
|
| 267 |
+
// Ensure Astro directories exist
|
| 268 |
+
mkdirSync(dirname(ASTRO_CONTENT_PATH), { recursive: true });
|
| 269 |
+
mkdirSync(ASTRO_ASSETS_PATH, { recursive: true });
|
| 270 |
+
|
| 271 |
+
// Copy MDX file
|
| 272 |
+
const files = readdirSync(outputDir);
|
| 273 |
+
const mdxFiles = files.filter(file => file.endsWith('.mdx'));
|
| 274 |
+
if (mdxFiles.length > 0) {
|
| 275 |
+
const mdxFile = join(outputDir, mdxFiles[0]); // Take the first MDX file
|
| 276 |
+
// Read and write instead of copy to avoid EPERM issues
|
| 277 |
+
let mdxContent = readFileSync(mdxFile, 'utf8');
|
| 278 |
+
|
| 279 |
+
// Apply final cleanup to ensure no exclude tags or unused imports remain
|
| 280 |
+
mdxContent = cleanupExcludeTagsAndImports(mdxContent);
|
| 281 |
+
|
| 282 |
+
writeFileSync(ASTRO_CONTENT_PATH, mdxContent);
|
| 283 |
+
console.log(` ✅ Copied and cleaned MDX to ${ASTRO_CONTENT_PATH}`);
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
// Copy images from both media and external-images directories
|
| 287 |
+
const imageExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.tiff', '.html'];
|
| 288 |
+
let totalImageCount = 0;
|
| 289 |
+
|
| 290 |
+
function copyImagesRecursively(dir, sourceName) {
|
| 291 |
+
if (!existsSync(dir)) return;
|
| 292 |
+
|
| 293 |
+
const files = readdirSync(dir);
|
| 294 |
+
for (const file of files) {
|
| 295 |
+
const filePath = join(dir, file);
|
| 296 |
+
const stat = statSync(filePath);
|
| 297 |
+
|
| 298 |
+
if (stat.isDirectory()) {
|
| 299 |
+
copyImagesRecursively(filePath, sourceName);
|
| 300 |
+
} else if (imageExtensions.some(ext => file.toLowerCase().endsWith(ext))) {
|
| 301 |
+
const filename = basename(filePath);
|
| 302 |
+
const destPath = join(ASTRO_ASSETS_PATH, filename);
|
| 303 |
+
|
| 304 |
+
try {
|
| 305 |
+
// Validate image by checking file size and basic structure
|
| 306 |
+
const stats = statSync(filePath);
|
| 307 |
+
if (stats.size === 0) {
|
| 308 |
+
console.log(` ⚠️ Skipping empty image: ${filename}`);
|
| 309 |
+
return;
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
// Try to copy and validate the result
|
| 313 |
+
copyFileSync(filePath, destPath);
|
| 314 |
+
|
| 315 |
+
// Additional validation - check if the copied file has reasonable size
|
| 316 |
+
const destStats = statSync(destPath);
|
| 317 |
+
if (destStats.size === 0) {
|
| 318 |
+
console.log(` ❌ Failed to copy corrupted image: ${filename}`);
|
| 319 |
+
// Remove the empty file
|
| 320 |
+
try {
|
| 321 |
+
unlinkSync(destPath);
|
| 322 |
+
} catch (e) { }
|
| 323 |
+
return;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
console.log(` ✅ Copied ${sourceName}: ${filename} (${destStats.size} bytes)`);
|
| 327 |
+
totalImageCount++;
|
| 328 |
+
} catch (error) {
|
| 329 |
+
console.log(` ❌ Failed to copy ${filename}: ${error.message}`);
|
| 330 |
+
}
|
| 331 |
+
}
|
| 332 |
+
}
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
// Copy images from media directory (Notion images)
|
| 336 |
+
const mediaDir = join(outputDir, 'media');
|
| 337 |
+
copyImagesRecursively(mediaDir, 'Notion image');
|
| 338 |
+
|
| 339 |
+
// Copy images from external-images directory (downloaded external images)
|
| 340 |
+
const externalImagesDir = join(outputDir, 'external-images');
|
| 341 |
+
copyImagesRecursively(externalImagesDir, 'external image');
|
| 342 |
+
|
| 343 |
+
if (totalImageCount > 0) {
|
| 344 |
+
console.log(` ✅ Copied ${totalImageCount} total image(s) to ${ASTRO_ASSETS_PATH}`);
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
// Always update image paths and filter problematic references in MDX file
|
| 348 |
+
if (existsSync(ASTRO_CONTENT_PATH)) {
|
| 349 |
+
const mdxContent = readFileSync(ASTRO_CONTENT_PATH, 'utf8');
|
| 350 |
+
let updatedContent = mdxContent.replace(/\.\/media\//g, './assets/image/');
|
| 351 |
+
// Remove the subdirectory from image paths since we copy images directly to assets/image/
|
| 352 |
+
updatedContent = updatedContent.replace(/\.\/assets\/image\/[^\/]+\//g, './assets/image/');
|
| 353 |
+
|
| 354 |
+
// Check which images actually exist and remove references to missing/corrupted ones
|
| 355 |
+
const imageReferences = updatedContent.match(/\.\/assets\/image\/[^\s\)]+/g) || [];
|
| 356 |
+
const existingImages = existsSync(ASTRO_ASSETS_PATH) ? readdirSync(ASTRO_ASSETS_PATH).filter(f =>
|
| 357 |
+
['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.tiff'].some(ext => f.toLowerCase().endsWith(ext))
|
| 358 |
+
) : [];
|
| 359 |
+
|
| 360 |
+
for (const imgRef of imageReferences) {
|
| 361 |
+
const filename = basename(imgRef);
|
| 362 |
+
if (!existingImages.includes(filename)) {
|
| 363 |
+
console.log(` ⚠️ Removing reference to missing/corrupted image: ${filename}`);
|
| 364 |
+
// Remove the entire image reference (both Image component and markdown syntax)
|
| 365 |
+
updatedContent = updatedContent.replace(
|
| 366 |
+
new RegExp(`<Image[^>]*src=["']${imgRef.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}["'][^>]*\/?>`, 'g'),
|
| 367 |
+
''
|
| 368 |
+
);
|
| 369 |
+
updatedContent = updatedContent.replace(
|
| 370 |
+
new RegExp(`!\\[.*?\\]\\(${imgRef.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\)`, 'g'),
|
| 371 |
+
''
|
| 372 |
+
);
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
writeFileSync(ASTRO_CONTENT_PATH, updatedContent);
|
| 377 |
+
console.log(` ✅ Updated image paths and filtered problematic references in MDX file`);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
// Copy static bibliography.bib if it exists, otherwise create empty
|
| 381 |
+
if (existsSync(STATIC_BIB_PATH)) {
|
| 382 |
+
const bibContent = readFileSync(STATIC_BIB_PATH, 'utf8');
|
| 383 |
+
writeFileSync(ASTRO_BIB_PATH, bibContent);
|
| 384 |
+
console.log(` ✅ Copied static bibliography from ${STATIC_BIB_PATH}`);
|
| 385 |
+
} else {
|
| 386 |
+
writeFileSync(ASTRO_BIB_PATH, '');
|
| 387 |
+
console.log(` ✅ Created empty bibliography (no static file found)`);
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
} catch (error) {
|
| 391 |
+
console.warn(` ⚠️ Failed to copy to Astro: ${error.message}`);
|
| 392 |
+
}
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
async function main() {
|
| 397 |
+
const args = process.argv.slice(2);
|
| 398 |
+
|
| 399 |
+
if (args.includes('--help') || args.includes('-h')) {
|
| 400 |
+
showHelp();
|
| 401 |
+
process.exit(0);
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
const config = parseArgs();
|
| 405 |
+
|
| 406 |
+
console.log('🚀 Notion to MDX Toolkit');
|
| 407 |
+
console.log('========================');
|
| 408 |
+
|
| 409 |
+
try {
|
| 410 |
+
// Prepare input config file
|
| 411 |
+
let inputConfigFile = config.input;
|
| 412 |
+
let pageIdFromEnv = null;
|
| 413 |
+
|
| 414 |
+
// If NOTION_PAGE_ID is provided via env var, create temporary pages.json
|
| 415 |
+
if (config.pageId && config.token) {
|
| 416 |
+
console.log('✨ Using NOTION_PAGE_ID from environment variable');
|
| 417 |
+
const tempConfigPath = join(config.output, '.temp-pages.json');
|
| 418 |
+
ensureDirectory(config.output);
|
| 419 |
+
await createPagesConfigFromEnv(config.pageId, config.token, tempConfigPath);
|
| 420 |
+
inputConfigFile = tempConfigPath;
|
| 421 |
+
pageIdFromEnv = config.pageId;
|
| 422 |
+
} else if (!existsSync(config.input)) {
|
| 423 |
+
console.error(`❌ No NOTION_PAGE_ID environment variable and no pages.json found at: ${config.input}`);
|
| 424 |
+
console.log('💡 Either set NOTION_PAGE_ID env var or create input/pages.json');
|
| 425 |
+
process.exit(1);
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
// Always clean output directory to avoid conflicts with previous imports
|
| 429 |
+
console.log('🧹 Cleaning output directory to avoid conflicts...');
|
| 430 |
+
await cleanDirectory(config.output);
|
| 431 |
+
|
| 432 |
+
// Clean assets/image directory and ensure proper permissions
|
| 433 |
+
console.log('🧹 Cleaning assets/image directory and setting permissions...');
|
| 434 |
+
if (existsSync(ASTRO_ASSETS_PATH)) {
|
| 435 |
+
await cleanDirectory(ASTRO_ASSETS_PATH);
|
| 436 |
+
} else {
|
| 437 |
+
ensureDirectory(ASTRO_ASSETS_PATH);
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
// Ensure proper permissions for assets directory
|
| 441 |
+
const { execSync } = await import('child_process');
|
| 442 |
+
try {
|
| 443 |
+
execSync(`chmod -R 755 "${ASTRO_ASSETS_PATH}"`, { stdio: 'inherit' });
|
| 444 |
+
console.log(' ✅ Set permissions for assets/image directory');
|
| 445 |
+
} catch (error) {
|
| 446 |
+
console.log(' ⚠️ Could not set permissions (non-critical):', error.message);
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
if (config.mdxOnly) {
|
| 450 |
+
// Only convert existing Markdown to MDX
|
| 451 |
+
console.log('📝 MDX conversion only mode');
|
| 452 |
+
await convertToMdx(config.output, config.output);
|
| 453 |
+
copyToAstroContent(config.output);
|
| 454 |
+
|
| 455 |
+
} else if (config.notionOnly) {
|
| 456 |
+
// Only convert Notion to Markdown
|
| 457 |
+
console.log('📄 Notion conversion only mode');
|
| 458 |
+
await convertNotionToMarkdown(inputConfigFile, config.output, config.token);
|
| 459 |
+
|
| 460 |
+
} else {
|
| 461 |
+
// Full workflow
|
| 462 |
+
console.log('🔄 Full conversion workflow');
|
| 463 |
+
|
| 464 |
+
// Step 1: Convert Notion to Markdown
|
| 465 |
+
console.log('\n📄 Step 1: Converting Notion pages to Markdown...');
|
| 466 |
+
await convertNotionToMarkdown(inputConfigFile, config.output, config.token);
|
| 467 |
+
|
| 468 |
+
// Step 2: Convert Markdown to MDX with Notion metadata
|
| 469 |
+
console.log('\n📝 Step 2: Converting Markdown to MDX...');
|
| 470 |
+
const pagesConfig = readPagesConfig(inputConfigFile);
|
| 471 |
+
const firstPage = pagesConfig.pages && pagesConfig.pages.length > 0 ? pagesConfig.pages[0] : null;
|
| 472 |
+
const pageId = pageIdFromEnv || (firstPage ? firstPage.id : null);
|
| 473 |
+
await convertToMdx(config.output, config.output, pageId, config.token);
|
| 474 |
+
|
| 475 |
+
// Step 3: Copy to Astro content directory
|
| 476 |
+
console.log('\n📋 Step 3: Copying to Astro content directory...');
|
| 477 |
+
copyToAstroContent(config.output);
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
console.log('\n🎉 Conversion completed successfully!');
|
| 481 |
+
|
| 482 |
+
} catch (error) {
|
| 483 |
+
console.error('❌ Error:', error.message);
|
| 484 |
+
process.exit(1);
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
// Export functions for use as module
|
| 489 |
+
export { convertNotionToMarkdown, convertToMdx };
|
| 490 |
+
|
| 491 |
+
// Run CLI if called directly
|
| 492 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 493 |
+
main();
|
| 494 |
+
}
|
app/scripts/notion-importer/input/pages.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d51fba4ce9b05562f5df611a150e3cd702b487d2e608441318336556e0f248a
|
| 3 |
+
size 188
|
app/scripts/notion-importer/mdx-converter.mjs
ADDED
|
@@ -0,0 +1,863 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, statSync } from 'fs';
|
| 4 |
+
import { join, dirname, basename, extname } from 'path';
|
| 5 |
+
import { fileURLToPath } from 'url';
|
| 6 |
+
import matter from 'gray-matter';
|
| 7 |
+
import fetch from 'node-fetch';
|
| 8 |
+
|
| 9 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 10 |
+
const __dirname = dirname(__filename);
|
| 11 |
+
|
| 12 |
+
// Configuration
|
| 13 |
+
const DEFAULT_INPUT = join(__dirname, 'output');
|
| 14 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output');
|
| 15 |
+
const STATIC_FRONTMATTER_PATH = join(__dirname, 'static', 'frontmatter.mdx');
|
| 16 |
+
|
| 17 |
+
function parseArgs() {
|
| 18 |
+
const args = process.argv.slice(2);
|
| 19 |
+
const config = {
|
| 20 |
+
input: DEFAULT_INPUT,
|
| 21 |
+
output: DEFAULT_OUTPUT,
|
| 22 |
+
};
|
| 23 |
+
|
| 24 |
+
for (const arg of args) {
|
| 25 |
+
if (arg.startsWith('--input=')) {
|
| 26 |
+
config.input = arg.substring('--input='.length);
|
| 27 |
+
} else if (arg.startsWith('--output=')) {
|
| 28 |
+
config.output = arg.substring('--output='.length);
|
| 29 |
+
} else if (arg === '--help' || arg === '-h') {
|
| 30 |
+
console.log(`
|
| 31 |
+
📝 Notion Markdown to MDX Converter
|
| 32 |
+
|
| 33 |
+
Usage:
|
| 34 |
+
node mdx-converter.mjs [options]
|
| 35 |
+
|
| 36 |
+
Options:
|
| 37 |
+
--input=PATH Input directory or file (default: ${DEFAULT_INPUT})
|
| 38 |
+
--output=PATH Output directory (default: ${DEFAULT_OUTPUT})
|
| 39 |
+
--help, -h Show this help
|
| 40 |
+
|
| 41 |
+
Examples:
|
| 42 |
+
# Convert all markdown files in output directory
|
| 43 |
+
node mdx-converter.mjs
|
| 44 |
+
|
| 45 |
+
# Convert specific file
|
| 46 |
+
node mdx-converter.mjs --input=article.md --output=converted/
|
| 47 |
+
|
| 48 |
+
# Convert directory
|
| 49 |
+
node mdx-converter.mjs --input=markdown-files/ --output=mdx-files/
|
| 50 |
+
`);
|
| 51 |
+
process.exit(0);
|
| 52 |
+
} else if (!config.input) {
|
| 53 |
+
config.input = arg;
|
| 54 |
+
} else if (!config.output) {
|
| 55 |
+
config.output = arg;
|
| 56 |
+
}
|
| 57 |
+
}
|
| 58 |
+
return config;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* Track which Astro components are used during transformations
|
| 63 |
+
*/
|
| 64 |
+
const usedComponents = new Set();
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
* Track individual image imports needed
|
| 68 |
+
*/
|
| 69 |
+
const imageImports = new Map(); // src -> varName
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Track external images that need to be downloaded
|
| 73 |
+
*/
|
| 74 |
+
const externalImagesToDownload = new Map(); // url -> localPath
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* Generate a variable name from image path
|
| 78 |
+
* @param {string} src - Image source path
|
| 79 |
+
* @returns {string} - Valid variable name
|
| 80 |
+
*/
|
| 81 |
+
function generateImageVarName(src) {
|
| 82 |
+
// Extract filename without extension and make it a valid JS variable
|
| 83 |
+
const filename = src.split('/').pop().replace(/\.[^.]+$/, '');
|
| 84 |
+
return filename.replace(/[^a-zA-Z0-9]/g, '_').replace(/^[0-9]/, 'img_$&');
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
* Check if a URL is an external URL (HTTP/HTTPS)
|
| 89 |
+
* @param {string} url - URL to check
|
| 90 |
+
* @returns {boolean} - True if it's an external URL
|
| 91 |
+
*/
|
| 92 |
+
function isExternalImageUrl(url) {
|
| 93 |
+
try {
|
| 94 |
+
const urlObj = new URL(url);
|
| 95 |
+
// Just check if it's HTTP/HTTPS - we'll try to download everything
|
| 96 |
+
return urlObj.protocol === 'http:' || urlObj.protocol === 'https:';
|
| 97 |
+
} catch {
|
| 98 |
+
return false;
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
/**
|
| 103 |
+
* Extract image URL from Twitter/X page
|
| 104 |
+
* @param {string} tweetUrl - URL of the tweet
|
| 105 |
+
* @returns {Promise<string|null>} - URL of the image or null if not found
|
| 106 |
+
*/
|
| 107 |
+
async function extractTwitterImageUrl(tweetUrl) {
|
| 108 |
+
try {
|
| 109 |
+
const response = await fetch(tweetUrl, {
|
| 110 |
+
headers: {
|
| 111 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 112 |
+
}
|
| 113 |
+
});
|
| 114 |
+
|
| 115 |
+
if (!response.ok) {
|
| 116 |
+
return null;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
const html = await response.text();
|
| 120 |
+
|
| 121 |
+
// Try to find image URLs in meta tags (Twitter Card)
|
| 122 |
+
const metaImageMatch = html.match(/<meta property="og:image" content="([^"]+)"/);
|
| 123 |
+
if (metaImageMatch) {
|
| 124 |
+
let imageUrl = metaImageMatch[1];
|
| 125 |
+
// Try to get the large version
|
| 126 |
+
if (imageUrl.includes('?')) {
|
| 127 |
+
imageUrl = imageUrl.split('?')[0] + '?format=jpg&name=large';
|
| 128 |
+
}
|
| 129 |
+
return imageUrl;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
// Fallback: try to find pbs.twimg.com URLs in the HTML
|
| 133 |
+
const pbsMatch = html.match(/https:\/\/pbs\.twimg\.com\/media\/([^"?]+)/);
|
| 134 |
+
if (pbsMatch) {
|
| 135 |
+
return `https://pbs.twimg.com/media/${pbsMatch[1]}?format=jpg&name=large`;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
return null;
|
| 139 |
+
} catch (error) {
|
| 140 |
+
console.log(` ⚠️ Failed to extract Twitter image: ${error.message}`);
|
| 141 |
+
return null;
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
/**
|
| 146 |
+
* Download an external URL and save it locally
|
| 147 |
+
* @param {string} imageUrl - External URL
|
| 148 |
+
* @param {string} outputDir - Directory to save the file
|
| 149 |
+
* @returns {Promise<string>} - Local path to the downloaded file
|
| 150 |
+
*/
|
| 151 |
+
async function downloadExternalImage(imageUrl, outputDir) {
|
| 152 |
+
try {
|
| 153 |
+
console.log(` 🌐 Downloading external URL: ${imageUrl}`);
|
| 154 |
+
|
| 155 |
+
// Create output directory if it doesn't exist
|
| 156 |
+
if (!existsSync(outputDir)) {
|
| 157 |
+
mkdirSync(outputDir, { recursive: true });
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
let actualImageUrl = imageUrl;
|
| 161 |
+
|
| 162 |
+
// Check if it's a Twitter/X URL
|
| 163 |
+
if (imageUrl.includes('twitter.com/') || imageUrl.includes('x.com/')) {
|
| 164 |
+
console.log(` 🐦 Detected Twitter/X URL, attempting to extract image...`);
|
| 165 |
+
const extractedUrl = await extractTwitterImageUrl(imageUrl);
|
| 166 |
+
if (extractedUrl) {
|
| 167 |
+
actualImageUrl = extractedUrl;
|
| 168 |
+
console.log(` ✅ Extracted image URL: ${extractedUrl}`);
|
| 169 |
+
} else {
|
| 170 |
+
console.log(` ⚠️ Could not automatically extract image from Twitter/X`);
|
| 171 |
+
console.log(` 💡 Manual download required:`);
|
| 172 |
+
console.log(` 1. Open ${imageUrl} in your browser`);
|
| 173 |
+
console.log(` 2. Right-click on the image and "Save image as..."`);
|
| 174 |
+
console.log(` 3. Save it to: app/src/content/assets/image/`);
|
| 175 |
+
throw new Error('Twitter/X images require manual download');
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
// Generate filename from URL
|
| 180 |
+
const urlObj = new URL(actualImageUrl);
|
| 181 |
+
const pathname = urlObj.pathname;
|
| 182 |
+
|
| 183 |
+
// Determine file extension - try to get it from URL, default to jpg
|
| 184 |
+
let extension = 'jpg';
|
| 185 |
+
if (pathname.includes('.')) {
|
| 186 |
+
const urlExtension = pathname.split('.').pop().toLowerCase();
|
| 187 |
+
if (['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp', 'tiff'].includes(urlExtension)) {
|
| 188 |
+
extension = urlExtension;
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
// Generate unique filename
|
| 193 |
+
const filename = `external_${Date.now()}_${Math.random().toString(36).substr(2, 9)}.${extension}`;
|
| 194 |
+
const localPath = join(outputDir, filename);
|
| 195 |
+
|
| 196 |
+
// Try to download the URL
|
| 197 |
+
const response = await fetch(actualImageUrl, {
|
| 198 |
+
headers: {
|
| 199 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 200 |
+
}
|
| 201 |
+
});
|
| 202 |
+
|
| 203 |
+
if (!response.ok) {
|
| 204 |
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
const buffer = await response.buffer();
|
| 208 |
+
|
| 209 |
+
// Validate that we actually got data
|
| 210 |
+
if (buffer.length === 0) {
|
| 211 |
+
throw new Error('Empty response');
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
// Validate that it's actually an image, not HTML
|
| 215 |
+
const contentType = response.headers.get('content-type');
|
| 216 |
+
if (contentType && contentType.includes('text/html')) {
|
| 217 |
+
throw new Error('Downloaded content is HTML, not an image');
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
// Save to local file
|
| 221 |
+
writeFileSync(localPath, buffer);
|
| 222 |
+
|
| 223 |
+
console.log(` ✅ Downloaded: ${filename} (${buffer.length} bytes)`);
|
| 224 |
+
return localPath;
|
| 225 |
+
|
| 226 |
+
} catch (error) {
|
| 227 |
+
console.log(` ❌ Failed to download ${imageUrl}: ${error.message}`);
|
| 228 |
+
throw error;
|
| 229 |
+
}
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
/**
|
| 233 |
+
* Process external images in content and download them
|
| 234 |
+
* @param {string} content - Markdown content
|
| 235 |
+
* @param {string} outputDir - Directory to save downloaded images
|
| 236 |
+
* @returns {Promise<string>} - Content with external images replaced by local paths
|
| 237 |
+
*/
|
| 238 |
+
async function processExternalImages(content, outputDir) {
|
| 239 |
+
console.log(' 🌐 Processing external images...');
|
| 240 |
+
|
| 241 |
+
let processedCount = 0;
|
| 242 |
+
let downloadedCount = 0;
|
| 243 |
+
|
| 244 |
+
// Find all external image URLs in markdown format: 
|
| 245 |
+
const externalImageRegex = /!\[([^\]]*)\]\(([^)]+)\)/g;
|
| 246 |
+
let match;
|
| 247 |
+
const externalImages = new Map(); // url -> alt text
|
| 248 |
+
|
| 249 |
+
// First pass: collect all external image URLs
|
| 250 |
+
while ((match = externalImageRegex.exec(content)) !== null) {
|
| 251 |
+
const alt = match[1];
|
| 252 |
+
const url = match[2];
|
| 253 |
+
|
| 254 |
+
if (isExternalImageUrl(url)) {
|
| 255 |
+
externalImages.set(url, alt);
|
| 256 |
+
console.log(` 🔍 Found external image: ${url}`);
|
| 257 |
+
}
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
if (externalImages.size === 0) {
|
| 261 |
+
console.log(' ℹ️ No external images found');
|
| 262 |
+
return content;
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
// Second pass: download images and replace URLs
|
| 266 |
+
let processedContent = content;
|
| 267 |
+
|
| 268 |
+
for (const [url, alt] of externalImages) {
|
| 269 |
+
try {
|
| 270 |
+
// Download the image
|
| 271 |
+
const localPath = await downloadExternalImage(url, outputDir);
|
| 272 |
+
const relativePath = `./assets/image/${basename(localPath)}`;
|
| 273 |
+
|
| 274 |
+
// Replace the URL in content
|
| 275 |
+
processedContent = processedContent.replace(
|
| 276 |
+
new RegExp(`!\\[${alt.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\]\\(${url.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\)`, 'g'),
|
| 277 |
+
``
|
| 278 |
+
);
|
| 279 |
+
|
| 280 |
+
downloadedCount++;
|
| 281 |
+
processedCount++;
|
| 282 |
+
|
| 283 |
+
} catch (error) {
|
| 284 |
+
console.log(` ⚠️ Skipping external image due to download failure: ${url}`);
|
| 285 |
+
}
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
if (downloadedCount > 0) {
|
| 289 |
+
console.log(` ✅ Downloaded ${downloadedCount} external image(s)`);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
return processedContent;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
/**
|
| 296 |
+
* Detect and track Astro components used in the content
|
| 297 |
+
* @param {string} content - MDX content
|
| 298 |
+
*/
|
| 299 |
+
function detectAstroComponents(content) {
|
| 300 |
+
console.log(' 🔍 Detecting Astro components in content...');
|
| 301 |
+
|
| 302 |
+
let detectedCount = 0;
|
| 303 |
+
|
| 304 |
+
// Known Astro components that should be auto-imported
|
| 305 |
+
const knownComponents = [
|
| 306 |
+
'HtmlEmbed', 'Image', 'Note', 'Sidenote', 'Wide', 'FullWidth',
|
| 307 |
+
'Accordion', 'Quote', 'Reference', 'Glossary', 'Stack', 'ThemeToggle',
|
| 308 |
+
'RawHtml', 'HfUser'
|
| 309 |
+
];
|
| 310 |
+
|
| 311 |
+
// Find all JSX elements that look like Astro components
|
| 312 |
+
// Pattern: <ComponentName ... />
|
| 313 |
+
const componentMatches = content.match(/<([A-Z][a-zA-Z0-9]*)\s*[^>]*\/?>/g);
|
| 314 |
+
|
| 315 |
+
if (componentMatches) {
|
| 316 |
+
for (const match of componentMatches) {
|
| 317 |
+
// Extract component name from the JSX element
|
| 318 |
+
const componentMatch = match.match(/<([A-Z][a-zA-Z0-9]*)/);
|
| 319 |
+
if (componentMatch) {
|
| 320 |
+
const componentName = componentMatch[1];
|
| 321 |
+
|
| 322 |
+
// Only track known Astro components (skip HTML elements)
|
| 323 |
+
if (knownComponents.includes(componentName) && !usedComponents.has(componentName)) {
|
| 324 |
+
usedComponents.add(componentName);
|
| 325 |
+
detectedCount++;
|
| 326 |
+
console.log(` 📦 Found component: ${componentName}`);
|
| 327 |
+
}
|
| 328 |
+
}
|
| 329 |
+
}
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
if (detectedCount > 0) {
|
| 333 |
+
console.log(` ✅ Detected ${detectedCount} new Astro component(s)`);
|
| 334 |
+
} else {
|
| 335 |
+
console.log(` ℹ️ No new Astro components detected`);
|
| 336 |
+
}
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
/**
|
| 340 |
+
* Add required component imports to the frontmatter
|
| 341 |
+
* @param {string} content - MDX content
|
| 342 |
+
* @returns {string} - Content with component imports
|
| 343 |
+
*/
|
| 344 |
+
function addComponentImports(content) {
|
| 345 |
+
console.log(' 📦 Adding component and image imports...');
|
| 346 |
+
|
| 347 |
+
let imports = [];
|
| 348 |
+
|
| 349 |
+
// Add component imports
|
| 350 |
+
if (usedComponents.size > 0) {
|
| 351 |
+
const componentImports = Array.from(usedComponents)
|
| 352 |
+
.map(component => `import ${component} from '../components/${component}.astro';`);
|
| 353 |
+
imports.push(...componentImports);
|
| 354 |
+
console.log(` ✅ Importing components: ${Array.from(usedComponents).join(', ')}`);
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
// Add image imports
|
| 358 |
+
if (imageImports.size > 0) {
|
| 359 |
+
const imageImportStatements = Array.from(imageImports.entries())
|
| 360 |
+
.map(([src, varName]) => `import ${varName} from '${src}';`);
|
| 361 |
+
imports.push(...imageImportStatements);
|
| 362 |
+
console.log(` ✅ Importing ${imageImports.size} image(s)`);
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
if (imports.length === 0) {
|
| 366 |
+
console.log(' ℹ️ No imports needed');
|
| 367 |
+
return content;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
const importBlock = imports.join('\n');
|
| 371 |
+
|
| 372 |
+
// Insert imports after frontmatter
|
| 373 |
+
const frontmatterEnd = content.indexOf('---', 3) + 3;
|
| 374 |
+
if (frontmatterEnd > 2) {
|
| 375 |
+
return content.slice(0, frontmatterEnd) + '\n\n' + importBlock + '\n\n' + content.slice(frontmatterEnd);
|
| 376 |
+
} else {
|
| 377 |
+
// No frontmatter, add at beginning
|
| 378 |
+
return importBlock + '\n\n' + content;
|
| 379 |
+
}
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
/**
|
| 384 |
+
* Load static frontmatter from file
|
| 385 |
+
* @returns {object} - Static frontmatter data
|
| 386 |
+
*/
|
| 387 |
+
function loadStaticFrontmatter() {
|
| 388 |
+
try {
|
| 389 |
+
if (existsSync(STATIC_FRONTMATTER_PATH)) {
|
| 390 |
+
const staticContent = readFileSync(STATIC_FRONTMATTER_PATH, 'utf8');
|
| 391 |
+
const { data } = matter(staticContent);
|
| 392 |
+
console.log(' ✅ Loaded static frontmatter from file');
|
| 393 |
+
return data;
|
| 394 |
+
}
|
| 395 |
+
console.log(' ℹ️ No static frontmatter file found');
|
| 396 |
+
return {};
|
| 397 |
+
} catch (error) {
|
| 398 |
+
console.log(` ⚠️ Failed to load static frontmatter: ${error.message}`);
|
| 399 |
+
return {};
|
| 400 |
+
}
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
/**
|
| 404 |
+
* Ensure proper frontmatter for MDX using static file first, then existing data
|
| 405 |
+
* @param {string} content - MDX content
|
| 406 |
+
* @param {string} pageId - Notion page ID (optional, kept for compatibility but ignored)
|
| 407 |
+
* @param {string} notionToken - Notion API token (optional, kept for compatibility but ignored)
|
| 408 |
+
* @returns {string} - Content with proper frontmatter
|
| 409 |
+
*/
|
| 410 |
+
async function ensureFrontmatter(content, pageId = null, notionToken = null) {
|
| 411 |
+
console.log(' 📄 Ensuring proper frontmatter...');
|
| 412 |
+
|
| 413 |
+
// Load static frontmatter first (highest priority)
|
| 414 |
+
const staticData = loadStaticFrontmatter();
|
| 415 |
+
|
| 416 |
+
if (!content.startsWith('---')) {
|
| 417 |
+
// No frontmatter in content, use static + basic defaults
|
| 418 |
+
let baseData = { ...staticData };
|
| 419 |
+
|
| 420 |
+
// Add basic defaults for required fields if not in static
|
| 421 |
+
if (!baseData.title) baseData.title = 'Article';
|
| 422 |
+
if (!baseData.published) {
|
| 423 |
+
baseData.published = new Date().toLocaleDateString('en-US', {
|
| 424 |
+
year: 'numeric',
|
| 425 |
+
month: 'short',
|
| 426 |
+
day: '2-digit'
|
| 427 |
+
});
|
| 428 |
+
}
|
| 429 |
+
if (baseData.tableOfContentsAutoCollapse === undefined) {
|
| 430 |
+
baseData.tableOfContentsAutoCollapse = true;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
const frontmatter = matter.stringify('', baseData);
|
| 434 |
+
console.log(' ✅ Applied static frontmatter to content without frontmatter');
|
| 435 |
+
return frontmatter + content;
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
// Parse existing frontmatter and merge with static (static takes priority)
|
| 439 |
+
try {
|
| 440 |
+
const { data: existingData, content: body } = matter(content);
|
| 441 |
+
|
| 442 |
+
// Merge: existing data first, then static data overrides
|
| 443 |
+
const mergedData = { ...existingData, ...staticData };
|
| 444 |
+
|
| 445 |
+
// Ensure required fields if still missing after merge
|
| 446 |
+
if (!mergedData.title) mergedData.title = 'Article';
|
| 447 |
+
if (!mergedData.published) {
|
| 448 |
+
mergedData.published = new Date().toLocaleDateString('en-US', {
|
| 449 |
+
year: 'numeric',
|
| 450 |
+
month: 'short',
|
| 451 |
+
day: '2-digit'
|
| 452 |
+
});
|
| 453 |
+
}
|
| 454 |
+
if (mergedData.tableOfContentsAutoCollapse === undefined) {
|
| 455 |
+
mergedData.tableOfContentsAutoCollapse = true;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
const enhancedContent = matter.stringify(body, mergedData);
|
| 459 |
+
console.log(' ✅ Merged static and existing frontmatter');
|
| 460 |
+
return enhancedContent;
|
| 461 |
+
} catch (error) {
|
| 462 |
+
console.log(' ⚠️ Could not parse frontmatter, keeping as is');
|
| 463 |
+
return content;
|
| 464 |
+
}
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
/**
|
| 468 |
+
* Generate basic frontmatter
|
| 469 |
+
* @returns {string} - Basic frontmatter
|
| 470 |
+
*/
|
| 471 |
+
function generateBasicFrontmatter() {
|
| 472 |
+
const currentDate = new Date().toLocaleDateString('en-US', {
|
| 473 |
+
year: 'numeric',
|
| 474 |
+
month: 'short',
|
| 475 |
+
day: '2-digit'
|
| 476 |
+
});
|
| 477 |
+
return `---
|
| 478 |
+
title: "Notion Article"
|
| 479 |
+
published: "${currentDate}"
|
| 480 |
+
tableOfContentsAutoCollapse: true
|
| 481 |
+
---
|
| 482 |
+
|
| 483 |
+
`;
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
/**
|
| 488 |
+
* Check if a line is a table line
|
| 489 |
+
* @param {string} line - Line to check
|
| 490 |
+
* @returns {boolean} - True if it's a table line
|
| 491 |
+
*/
|
| 492 |
+
function isTableLine(line) {
|
| 493 |
+
const trimmed = line.trim();
|
| 494 |
+
return trimmed.startsWith('|') && trimmed.endsWith('|');
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
/**
|
| 498 |
+
* Check if a line is a list item
|
| 499 |
+
* @param {string} line - Line to check
|
| 500 |
+
* @returns {boolean} - True if it's a list item
|
| 501 |
+
*/
|
| 502 |
+
function isListItem(line) {
|
| 503 |
+
const trimmed = line.trim();
|
| 504 |
+
// Match: * -, + (bullet points) or 1. 2. 3. (numbered lists)
|
| 505 |
+
return /^\s*[\*\-\+]\s/.test(trimmed) || /^\s*\d+\.\s/.test(trimmed);
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
/**
|
| 509 |
+
* Add a blank line after each markdown table and list
|
| 510 |
+
* @param {string} content - MDX content
|
| 511 |
+
* @returns {string} - Content with blank lines after tables and lists
|
| 512 |
+
*/
|
| 513 |
+
function addBlankLineAfterTablesAndLists(content) {
|
| 514 |
+
console.log(' 📋 Adding blank lines after tables and lists...');
|
| 515 |
+
|
| 516 |
+
let addedTableCount = 0;
|
| 517 |
+
let addedListCount = 0;
|
| 518 |
+
const lines = content.split('\n');
|
| 519 |
+
const result = [];
|
| 520 |
+
|
| 521 |
+
for (let i = 0; i < lines.length; i++) {
|
| 522 |
+
result.push(lines[i]);
|
| 523 |
+
|
| 524 |
+
// Check if current line is the end of a table
|
| 525 |
+
if (isTableLine(lines[i])) {
|
| 526 |
+
// Look ahead to see if this is the last line of a table
|
| 527 |
+
let isLastTableLine = false;
|
| 528 |
+
|
| 529 |
+
// Check if next line is empty or doesn't start with |
|
| 530 |
+
if (i + 1 >= lines.length ||
|
| 531 |
+
lines[i + 1].trim() === '' ||
|
| 532 |
+
!isTableLine(lines[i + 1])) {
|
| 533 |
+
|
| 534 |
+
// Look back to find if we're actually inside a table
|
| 535 |
+
let tableLineCount = 0;
|
| 536 |
+
for (let j = i; j >= 0 && isTableLine(lines[j]); j--) {
|
| 537 |
+
tableLineCount++;
|
| 538 |
+
}
|
| 539 |
+
|
| 540 |
+
// Only add blank line if we found at least 2 table lines (making it a real table)
|
| 541 |
+
if (tableLineCount >= 2) {
|
| 542 |
+
isLastTableLine = true;
|
| 543 |
+
}
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
if (isLastTableLine) {
|
| 547 |
+
addedTableCount++;
|
| 548 |
+
result.push(''); // Add blank line
|
| 549 |
+
}
|
| 550 |
+
}
|
| 551 |
+
// Check if current line is the end of a list
|
| 552 |
+
else if (isListItem(lines[i])) {
|
| 553 |
+
// Look ahead to see if this is the last line of a list
|
| 554 |
+
let isLastListItem = false;
|
| 555 |
+
|
| 556 |
+
// Check if next line is empty or doesn't start with list marker
|
| 557 |
+
if (i + 1 >= lines.length ||
|
| 558 |
+
lines[i + 1].trim() === '' ||
|
| 559 |
+
!isListItem(lines[i + 1])) {
|
| 560 |
+
isLastListItem = true;
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
if (isLastListItem) {
|
| 564 |
+
addedListCount++;
|
| 565 |
+
result.push(''); // Add blank line
|
| 566 |
+
}
|
| 567 |
+
}
|
| 568 |
+
}
|
| 569 |
+
|
| 570 |
+
if (addedTableCount > 0 || addedListCount > 0) {
|
| 571 |
+
console.log(` ✅ Added blank line after ${addedTableCount} table(s) and ${addedListCount} list(s)`);
|
| 572 |
+
} else {
|
| 573 |
+
console.log(' ℹ️ No tables or lists found to process');
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
return result.join('\n');
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
/**
|
| 580 |
+
* Transform markdown images to Image components
|
| 581 |
+
* @param {string} content - Markdown content
|
| 582 |
+
* @returns {string} - Content with Image components
|
| 583 |
+
*/
|
| 584 |
+
function transformMarkdownImages(content) {
|
| 585 |
+
console.log(' 🖼️ Transforming markdown images to Image components...');
|
| 586 |
+
|
| 587 |
+
let transformedCount = 0;
|
| 588 |
+
|
| 589 |
+
// Transform markdown images:  -> <Image src={varName} alt="alt" />
|
| 590 |
+
content = content.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, alt, src) => {
|
| 591 |
+
transformedCount++;
|
| 592 |
+
|
| 593 |
+
// Clean up the src path - remove /media/ prefix and use relative path
|
| 594 |
+
let cleanSrc = src;
|
| 595 |
+
if (src.startsWith('/media/')) {
|
| 596 |
+
cleanSrc = src.replace('/media/', './assets/image/');
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
// Generate variable name for the image import
|
| 600 |
+
const varName = generateImageVarName(cleanSrc);
|
| 601 |
+
|
| 602 |
+
// Add to imageImports if not already present
|
| 603 |
+
if (!imageImports.has(cleanSrc)) {
|
| 604 |
+
imageImports.set(cleanSrc, varName);
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
// Extract filename for alt text if none provided
|
| 608 |
+
const finalAlt = alt || src.split('/').pop().split('.')[0];
|
| 609 |
+
|
| 610 |
+
return `<Image src={${varName}} alt="${finalAlt}" />`;
|
| 611 |
+
});
|
| 612 |
+
|
| 613 |
+
if (transformedCount > 0) {
|
| 614 |
+
console.log(` ✅ Transformed ${transformedCount} markdown image(s) to Image components with imports`);
|
| 615 |
+
} else {
|
| 616 |
+
console.log(' ℹ️ No markdown images found to transform');
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
return content;
|
| 620 |
+
}
|
| 621 |
+
|
| 622 |
+
/**
|
| 623 |
+
* Add proper spacing around Astro components
|
| 624 |
+
* @param {string} content - MDX content
|
| 625 |
+
* @returns {string} - Content with proper spacing around components
|
| 626 |
+
*/
|
| 627 |
+
function addSpacingAroundComponents(content) {
|
| 628 |
+
console.log(' 📏 Adding spacing around Astro components...');
|
| 629 |
+
|
| 630 |
+
let processedContent = content;
|
| 631 |
+
let spacingCount = 0;
|
| 632 |
+
|
| 633 |
+
// Known Astro components that should have spacing
|
| 634 |
+
const knownComponents = [
|
| 635 |
+
'HtmlEmbed', 'Image', 'Note', 'Sidenote', 'Wide', 'FullWidth',
|
| 636 |
+
'Accordion', 'Quote', 'Reference', 'Glossary', 'Stack', 'ThemeToggle',
|
| 637 |
+
'RawHtml', 'HfUser', 'Figure'
|
| 638 |
+
];
|
| 639 |
+
|
| 640 |
+
// Process each component type
|
| 641 |
+
for (const component of knownComponents) {
|
| 642 |
+
// Pattern for components with content: <Component>...</Component>
|
| 643 |
+
// Process this first to handle the complete component structure
|
| 644 |
+
const withContentPattern = new RegExp(`(<${component}[^>]*>)([\\s\\S]*?)(<\\/${component}>)`, 'g');
|
| 645 |
+
processedContent = processedContent.replace(withContentPattern, (match, openTag, content, closeTag) => {
|
| 646 |
+
spacingCount++;
|
| 647 |
+
// Ensure blank line before opening tag and after closing tag
|
| 648 |
+
// Also ensure closing tag is on its own line
|
| 649 |
+
const trimmedContent = content.trim();
|
| 650 |
+
return `\n\n${openTag}\n${trimmedContent}\n${closeTag}\n\n`;
|
| 651 |
+
});
|
| 652 |
+
|
| 653 |
+
// Pattern for self-closing components: <Component ... />
|
| 654 |
+
const selfClosingPattern = new RegExp(`(<${component}[^>]*\\/?>)`, 'g');
|
| 655 |
+
processedContent = processedContent.replace(selfClosingPattern, (match) => {
|
| 656 |
+
spacingCount++;
|
| 657 |
+
return `\n\n${match}\n\n`;
|
| 658 |
+
});
|
| 659 |
+
}
|
| 660 |
+
|
| 661 |
+
// Clean up excessive newlines (more than 2 consecutive)
|
| 662 |
+
processedContent = processedContent.replace(/\n{3,}/g, '\n\n');
|
| 663 |
+
|
| 664 |
+
if (spacingCount > 0) {
|
| 665 |
+
console.log(` ✅ Added spacing around ${spacingCount} component(s)`);
|
| 666 |
+
} else {
|
| 667 |
+
console.log(' ℹ️ No components found to add spacing around');
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
return processedContent;
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
/**
|
| 674 |
+
* Fix smart quotes (curly quotes) and replace them with straight quotes
|
| 675 |
+
* @param {string} content - Markdown content
|
| 676 |
+
* @returns {string} - Content with fixed quotes
|
| 677 |
+
*/
|
| 678 |
+
function fixSmartQuotes(content) {
|
| 679 |
+
console.log(' ✏️ Fixing smart quotes (curly quotes)...');
|
| 680 |
+
|
| 681 |
+
let fixedCount = 0;
|
| 682 |
+
const originalContent = content;
|
| 683 |
+
|
| 684 |
+
// Replace opening smart double quotes (\u201C) with straight quotes (")
|
| 685 |
+
content = content.replace(/\u201C/g, '"');
|
| 686 |
+
|
| 687 |
+
// Replace closing smart double quotes (\u201D) with straight quotes (")
|
| 688 |
+
content = content.replace(/\u201D/g, '"');
|
| 689 |
+
|
| 690 |
+
// Replace opening smart single quotes (\u2018) with straight quotes (')
|
| 691 |
+
content = content.replace(/\u2018/g, "'");
|
| 692 |
+
|
| 693 |
+
// Replace closing smart single quotes (\u2019) with straight quotes (')
|
| 694 |
+
content = content.replace(/\u2019/g, "'");
|
| 695 |
+
|
| 696 |
+
// Count the number of replacements made
|
| 697 |
+
fixedCount = 0;
|
| 698 |
+
for (let i = 0; i < originalContent.length; i++) {
|
| 699 |
+
const char = originalContent[i];
|
| 700 |
+
if (char === '\u201C' || char === '\u201D' || char === '\u2018' || char === '\u2019') {
|
| 701 |
+
fixedCount++;
|
| 702 |
+
}
|
| 703 |
+
}
|
| 704 |
+
|
| 705 |
+
if (fixedCount > 0) {
|
| 706 |
+
console.log(` ✅ Fixed ${fixedCount} smart quote(s)`);
|
| 707 |
+
} else {
|
| 708 |
+
console.log(' ℹ️ No smart quotes found');
|
| 709 |
+
}
|
| 710 |
+
|
| 711 |
+
return content;
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
+
/**
|
| 715 |
+
* Main MDX processing function that applies all transformations
|
| 716 |
+
* @param {string} content - Raw Markdown content
|
| 717 |
+
* @param {string} pageId - Notion page ID (optional)
|
| 718 |
+
* @param {string} notionToken - Notion API token (optional)
|
| 719 |
+
* @param {string} outputDir - Output directory for downloaded images (optional)
|
| 720 |
+
* @returns {string} - Processed MDX content compatible with Astro
|
| 721 |
+
*/
|
| 722 |
+
async function processMdxContent(content, pageId = null, notionToken = null, outputDir = null) {
|
| 723 |
+
console.log('🔧 Processing for Astro MDX compatibility...');
|
| 724 |
+
|
| 725 |
+
// Clear previous tracking
|
| 726 |
+
usedComponents.clear();
|
| 727 |
+
imageImports.clear();
|
| 728 |
+
externalImagesToDownload.clear();
|
| 729 |
+
|
| 730 |
+
let processedContent = content;
|
| 731 |
+
|
| 732 |
+
// Fix smart quotes first
|
| 733 |
+
processedContent = fixSmartQuotes(processedContent);
|
| 734 |
+
|
| 735 |
+
// Process external images first (before other transformations)
|
| 736 |
+
if (outputDir) {
|
| 737 |
+
// Create a temporary external images directory in the output folder
|
| 738 |
+
const externalImagesDir = join(outputDir, 'external-images');
|
| 739 |
+
processedContent = await processExternalImages(processedContent, externalImagesDir);
|
| 740 |
+
}
|
| 741 |
+
|
| 742 |
+
// Apply essential steps only
|
| 743 |
+
processedContent = await ensureFrontmatter(processedContent, pageId, notionToken);
|
| 744 |
+
|
| 745 |
+
// Add blank lines after tables and lists
|
| 746 |
+
processedContent = addBlankLineAfterTablesAndLists(processedContent);
|
| 747 |
+
|
| 748 |
+
// Transform markdown images to Image components
|
| 749 |
+
processedContent = transformMarkdownImages(processedContent);
|
| 750 |
+
|
| 751 |
+
// Add spacing around Astro components
|
| 752 |
+
processedContent = addSpacingAroundComponents(processedContent);
|
| 753 |
+
|
| 754 |
+
// Detect Astro components used in the content before adding imports
|
| 755 |
+
detectAstroComponents(processedContent);
|
| 756 |
+
|
| 757 |
+
// Add component imports at the end
|
| 758 |
+
processedContent = addComponentImports(processedContent);
|
| 759 |
+
|
| 760 |
+
return processedContent;
|
| 761 |
+
}
|
| 762 |
+
|
| 763 |
+
/**
|
| 764 |
+
* Convert a single markdown file to MDX
|
| 765 |
+
* @param {string} inputFile - Input markdown file
|
| 766 |
+
* @param {string} outputDir - Output directory
|
| 767 |
+
* @param {string} pageId - Notion page ID (optional)
|
| 768 |
+
* @param {string} notionToken - Notion API token (optional)
|
| 769 |
+
*/
|
| 770 |
+
async function convertFileToMdx(inputFile, outputDir, pageId = null, notionToken = null) {
|
| 771 |
+
const filename = basename(inputFile, '.md');
|
| 772 |
+
const outputFile = join(outputDir, `${filename}.mdx`);
|
| 773 |
+
|
| 774 |
+
console.log(`📝 Converting: ${basename(inputFile)} → ${basename(outputFile)}`);
|
| 775 |
+
|
| 776 |
+
try {
|
| 777 |
+
const markdownContent = readFileSync(inputFile, 'utf8');
|
| 778 |
+
const mdxContent = await processMdxContent(markdownContent, pageId, notionToken, outputDir);
|
| 779 |
+
writeFileSync(outputFile, mdxContent);
|
| 780 |
+
|
| 781 |
+
console.log(` ✅ Converted: ${outputFile}`);
|
| 782 |
+
|
| 783 |
+
// Show file size
|
| 784 |
+
const inputSize = Math.round(markdownContent.length / 1024);
|
| 785 |
+
const outputSize = Math.round(mdxContent.length / 1024);
|
| 786 |
+
console.log(` 📊 Input: ${inputSize}KB → Output: ${outputSize}KB`);
|
| 787 |
+
|
| 788 |
+
} catch (error) {
|
| 789 |
+
console.error(` ❌ Failed to convert ${inputFile}: ${error.message}`);
|
| 790 |
+
}
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
/**
|
| 794 |
+
* Convert all markdown files in a directory to MDX
|
| 795 |
+
* @param {string} inputPath - Input path (file or directory)
|
| 796 |
+
* @param {string} outputDir - Output directory
|
| 797 |
+
* @param {string} pageId - Notion page ID (optional)
|
| 798 |
+
* @param {string} notionToken - Notion API token (optional)
|
| 799 |
+
*/
|
| 800 |
+
async function convertToMdx(inputPath, outputDir, pageId = null, notionToken = null) {
|
| 801 |
+
console.log('📝 Notion Markdown to Astro MDX Converter');
|
| 802 |
+
console.log(`📁 Input: ${inputPath}`);
|
| 803 |
+
console.log(`📁 Output: ${outputDir}`);
|
| 804 |
+
|
| 805 |
+
// Check if input exists
|
| 806 |
+
if (!existsSync(inputPath)) {
|
| 807 |
+
console.error(`❌ Input not found: ${inputPath}`);
|
| 808 |
+
process.exit(1);
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
try {
|
| 812 |
+
// Ensure output directory exists
|
| 813 |
+
if (!existsSync(outputDir)) {
|
| 814 |
+
mkdirSync(outputDir, { recursive: true });
|
| 815 |
+
}
|
| 816 |
+
|
| 817 |
+
let filesToConvert = [];
|
| 818 |
+
|
| 819 |
+
if (statSync(inputPath).isDirectory()) {
|
| 820 |
+
// Convert all .md files in directory
|
| 821 |
+
const files = readdirSync(inputPath);
|
| 822 |
+
filesToConvert = files
|
| 823 |
+
.filter(file => file.endsWith('.md') && !file.includes('.raw.md'))
|
| 824 |
+
.map(file => join(inputPath, file));
|
| 825 |
+
} else if (inputPath.endsWith('.md')) {
|
| 826 |
+
// Convert single file
|
| 827 |
+
filesToConvert = [inputPath];
|
| 828 |
+
} else {
|
| 829 |
+
console.error('❌ Input must be a .md file or directory containing .md files');
|
| 830 |
+
process.exit(1);
|
| 831 |
+
}
|
| 832 |
+
|
| 833 |
+
if (filesToConvert.length === 0) {
|
| 834 |
+
console.log('ℹ️ No .md files found to convert');
|
| 835 |
+
return;
|
| 836 |
+
}
|
| 837 |
+
|
| 838 |
+
console.log(`🔄 Found ${filesToConvert.length} file(s) to convert`);
|
| 839 |
+
|
| 840 |
+
// Convert each file
|
| 841 |
+
for (const file of filesToConvert) {
|
| 842 |
+
await convertFileToMdx(file, outputDir, pageId, notionToken);
|
| 843 |
+
}
|
| 844 |
+
|
| 845 |
+
console.log(`✅ Conversion completed! ${filesToConvert.length} file(s) processed`);
|
| 846 |
+
|
| 847 |
+
} catch (error) {
|
| 848 |
+
console.error('❌ Conversion failed:', error.message);
|
| 849 |
+
process.exit(1);
|
| 850 |
+
}
|
| 851 |
+
}
|
| 852 |
+
|
| 853 |
+
export { convertToMdx };
|
| 854 |
+
|
| 855 |
+
function main() {
|
| 856 |
+
const config = parseArgs();
|
| 857 |
+
convertToMdx(config.input, config.output);
|
| 858 |
+
console.log('🎉 MDX conversion completed!');
|
| 859 |
+
}
|
| 860 |
+
|
| 861 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 862 |
+
main();
|
| 863 |
+
}
|
app/scripts/notion-importer/notion-converter.mjs
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env node
|
| 2 |
+
|
| 3 |
+
import { config } from 'dotenv';
|
| 4 |
+
import { Client } from '@notionhq/client';
|
| 5 |
+
import { NotionConverter } from 'notion-to-md';
|
| 6 |
+
import { DefaultExporter } from 'notion-to-md/plugins/exporter';
|
| 7 |
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
| 8 |
+
import { join, dirname, basename } from 'path';
|
| 9 |
+
import { fileURLToPath } from 'url';
|
| 10 |
+
import { postProcessMarkdown } from './post-processor.mjs';
|
| 11 |
+
|
| 12 |
+
// Load environment variables from .env file (but don't override existing ones)
|
| 13 |
+
config({ override: false });
|
| 14 |
+
|
| 15 |
+
const __filename = fileURLToPath(import.meta.url);
|
| 16 |
+
const __dirname = dirname(__filename);
|
| 17 |
+
|
| 18 |
+
// Configuration
|
| 19 |
+
const DEFAULT_INPUT = join(__dirname, 'input', 'pages.json');
|
| 20 |
+
const DEFAULT_OUTPUT = join(__dirname, 'output');
|
| 21 |
+
|
| 22 |
+
function parseArgs() {
|
| 23 |
+
const args = process.argv.slice(2);
|
| 24 |
+
const config = {
|
| 25 |
+
input: DEFAULT_INPUT,
|
| 26 |
+
output: DEFAULT_OUTPUT,
|
| 27 |
+
clean: false,
|
| 28 |
+
token: process.env.NOTION_TOKEN
|
| 29 |
+
};
|
| 30 |
+
|
| 31 |
+
for (const arg of args) {
|
| 32 |
+
if (arg.startsWith('--input=')) {
|
| 33 |
+
config.input = arg.split('=')[1];
|
| 34 |
+
} else if (arg.startsWith('--output=')) {
|
| 35 |
+
config.output = arg.split('=')[1];
|
| 36 |
+
} else if (arg.startsWith('--token=')) {
|
| 37 |
+
config.token = arg.split('=')[1];
|
| 38 |
+
} else if (arg === '--clean') {
|
| 39 |
+
config.clean = true;
|
| 40 |
+
}
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
return config;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
function ensureDirectory(dir) {
|
| 47 |
+
if (!existsSync(dir)) {
|
| 48 |
+
mkdirSync(dir, { recursive: true });
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
function loadPagesConfig(configFile) {
|
| 53 |
+
if (!existsSync(configFile)) {
|
| 54 |
+
console.error(`❌ Configuration file not found: ${configFile}`);
|
| 55 |
+
console.log('📝 Create a pages.json file with your Notion page IDs:');
|
| 56 |
+
console.log(`
|
| 57 |
+
{
|
| 58 |
+
"pages": [
|
| 59 |
+
{
|
| 60 |
+
"id": "your-notion-page-id-1",
|
| 61 |
+
"title": "Page Title 1",
|
| 62 |
+
"slug": "page-1"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"id": "your-notion-page-id-2",
|
| 66 |
+
"title": "Page Title 2",
|
| 67 |
+
"slug": "page-2"
|
| 68 |
+
}
|
| 69 |
+
]
|
| 70 |
+
}
|
| 71 |
+
`);
|
| 72 |
+
process.exit(1);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
try {
|
| 76 |
+
const config = JSON.parse(readFileSync(configFile, 'utf8'));
|
| 77 |
+
return config.pages || [];
|
| 78 |
+
} catch (error) {
|
| 79 |
+
console.error(`❌ Error reading configuration: ${error.message}`);
|
| 80 |
+
process.exit(1);
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
/**
|
| 85 |
+
* Convert a single Notion page to Markdown with advanced media handling
|
| 86 |
+
* @param {Object} notion - Notion client
|
| 87 |
+
* @param {string} pageId - Notion page ID
|
| 88 |
+
* @param {string} outputDir - Output directory
|
| 89 |
+
* @param {string} pageTitle - Page title for file naming
|
| 90 |
+
* @returns {Promise<string>} - Path to generated markdown file
|
| 91 |
+
*/
|
| 92 |
+
async function convertNotionPage(notion, pageId, outputDir, pageTitle) {
|
| 93 |
+
console.log(`📄 Converting Notion page: ${pageTitle} (${pageId})`);
|
| 94 |
+
|
| 95 |
+
try {
|
| 96 |
+
// Create media directory for this page
|
| 97 |
+
const mediaDir = join(outputDir, 'media', pageId);
|
| 98 |
+
ensureDirectory(mediaDir);
|
| 99 |
+
|
| 100 |
+
// Configure the DefaultExporter to save to a file
|
| 101 |
+
const outputFile = join(outputDir, `${pageTitle}.md`);
|
| 102 |
+
const exporter = new DefaultExporter({
|
| 103 |
+
outputType: 'file',
|
| 104 |
+
outputPath: outputFile,
|
| 105 |
+
});
|
| 106 |
+
|
| 107 |
+
// Create the converter with media downloading strategy
|
| 108 |
+
const n2m = new NotionConverter(notion)
|
| 109 |
+
.withExporter(exporter)
|
| 110 |
+
// Download media to local directory with path transformation
|
| 111 |
+
.downloadMediaTo({
|
| 112 |
+
outputDir: mediaDir,
|
| 113 |
+
// Transform paths to be web-accessible
|
| 114 |
+
transformPath: (localPath) => `/media/${pageId}/${basename(localPath)}`,
|
| 115 |
+
});
|
| 116 |
+
|
| 117 |
+
// Convert the page
|
| 118 |
+
const result = await n2m.convert(pageId);
|
| 119 |
+
|
| 120 |
+
console.log(` ✅ Converted to: ${outputFile}`);
|
| 121 |
+
console.log(` 📊 Content length: ${result.content.length} characters`);
|
| 122 |
+
console.log(` 🖼️ Media saved to: ${mediaDir}`);
|
| 123 |
+
|
| 124 |
+
return outputFile;
|
| 125 |
+
|
| 126 |
+
} catch (error) {
|
| 127 |
+
console.error(` ❌ Failed to convert page ${pageId}: ${error.message}`);
|
| 128 |
+
throw error;
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
/**
|
| 133 |
+
* Process Notion pages with advanced configuration
|
| 134 |
+
* @param {string} inputFile - Path to pages configuration
|
| 135 |
+
* @param {string} outputDir - Output directory
|
| 136 |
+
* @param {string} notionToken - Notion API token
|
| 137 |
+
*/
|
| 138 |
+
export async function convertNotionToMarkdown(inputFile, outputDir, notionToken) {
|
| 139 |
+
console.log('🚀 Notion to Markdown Converter');
|
| 140 |
+
console.log(`📁 Input: ${inputFile}`);
|
| 141 |
+
console.log(`📁 Output: ${outputDir}`);
|
| 142 |
+
|
| 143 |
+
// Validate Notion token
|
| 144 |
+
if (!notionToken) {
|
| 145 |
+
console.error('❌ NOTION_TOKEN not found. Please set it as environment variable or use --token=YOUR_TOKEN');
|
| 146 |
+
process.exit(1);
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
// Ensure output directory exists
|
| 150 |
+
ensureDirectory(outputDir);
|
| 151 |
+
|
| 152 |
+
try {
|
| 153 |
+
// Initialize Notion client
|
| 154 |
+
const notion = new Client({
|
| 155 |
+
auth: notionToken,
|
| 156 |
+
});
|
| 157 |
+
|
| 158 |
+
// Load pages configuration
|
| 159 |
+
const pages = loadPagesConfig(inputFile);
|
| 160 |
+
console.log(`📋 Found ${pages.length} page(s) to convert`);
|
| 161 |
+
|
| 162 |
+
const convertedFiles = [];
|
| 163 |
+
|
| 164 |
+
// Convert each page
|
| 165 |
+
for (const page of pages) {
|
| 166 |
+
try {
|
| 167 |
+
const outputFile = await convertNotionPage(
|
| 168 |
+
notion,
|
| 169 |
+
page.id,
|
| 170 |
+
outputDir,
|
| 171 |
+
page.slug || page.title?.toLowerCase().replace(/\s+/g, '-') || page.id
|
| 172 |
+
);
|
| 173 |
+
convertedFiles.push(outputFile);
|
| 174 |
+
} catch (error) {
|
| 175 |
+
console.error(`❌ Failed to convert page ${page.id}: ${error.message}`);
|
| 176 |
+
// Continue with other pages
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
// Post-process all converted files and create one intermediate file
|
| 181 |
+
console.log('🔧 Post-processing converted files...');
|
| 182 |
+
for (const file of convertedFiles) {
|
| 183 |
+
try {
|
| 184 |
+
// Read the raw markdown from notion-to-md
|
| 185 |
+
let rawContent = readFileSync(file, 'utf8');
|
| 186 |
+
|
| 187 |
+
// Create intermediate file: raw markdown (from notion-to-md)
|
| 188 |
+
const rawFile = file.replace('.md', '.raw.md');
|
| 189 |
+
writeFileSync(rawFile, rawContent);
|
| 190 |
+
console.log(` 📄 Created raw markdown: ${basename(rawFile)}`);
|
| 191 |
+
|
| 192 |
+
// Apply post-processing with Notion client for page inclusion
|
| 193 |
+
let processedContent = await postProcessMarkdown(rawContent, notion, notionToken);
|
| 194 |
+
writeFileSync(file, processedContent);
|
| 195 |
+
console.log(` ✅ Post-processed: ${basename(file)}`);
|
| 196 |
+
} catch (error) {
|
| 197 |
+
console.error(` ❌ Failed to post-process ${file}: ${error.message}`);
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
console.log(`✅ Conversion completed! ${convertedFiles.length} file(s) generated`);
|
| 202 |
+
|
| 203 |
+
} catch (error) {
|
| 204 |
+
console.error('❌ Conversion failed:', error.message);
|
| 205 |
+
process.exit(1);
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
function main() {
|
| 210 |
+
const config = parseArgs();
|
| 211 |
+
|
| 212 |
+
if (config.clean) {
|
| 213 |
+
console.log('🧹 Cleaning output directory...');
|
| 214 |
+
// Clean output directory logic would go here
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
convertNotionToMarkdown(config.input, config.output, config.token);
|
| 218 |
+
console.log('🎉 Notion conversion completed!');
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
// Show help if requested
|
| 222 |
+
if (process.argv.includes('--help') || process.argv.includes('-h')) {
|
| 223 |
+
console.log(`
|
| 224 |
+
🚀 Notion to Markdown Converter
|
| 225 |
+
|
| 226 |
+
Usage:
|
| 227 |
+
node notion-converter.mjs [options]
|
| 228 |
+
|
| 229 |
+
Options:
|
| 230 |
+
--input=PATH Input pages configuration file (default: input/pages.json)
|
| 231 |
+
--output=PATH Output directory (default: output/)
|
| 232 |
+
--token=TOKEN Notion API token (or set NOTION_TOKEN env var)
|
| 233 |
+
--clean Clean output directory before conversion
|
| 234 |
+
--help, -h Show this help
|
| 235 |
+
|
| 236 |
+
Environment Variables:
|
| 237 |
+
NOTION_TOKEN Your Notion integration token
|
| 238 |
+
|
| 239 |
+
Examples:
|
| 240 |
+
# Basic conversion with environment token
|
| 241 |
+
NOTION_TOKEN=your_token node notion-converter.mjs
|
| 242 |
+
|
| 243 |
+
# Custom paths and token
|
| 244 |
+
node notion-converter.mjs --input=my-pages.json --output=converted/ --token=your_token
|
| 245 |
+
|
| 246 |
+
# Clean output first
|
| 247 |
+
node notion-converter.mjs --clean
|
| 248 |
+
|
| 249 |
+
Configuration File Format (pages.json):
|
| 250 |
+
{
|
| 251 |
+
"pages": [
|
| 252 |
+
{
|
| 253 |
+
"id": "your-notion-page-id",
|
| 254 |
+
"title": "Page Title",
|
| 255 |
+
"slug": "page-slug"
|
| 256 |
+
}
|
| 257 |
+
]
|
| 258 |
+
}
|
| 259 |
+
`);
|
| 260 |
+
process.exit(0);
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
// Run CLI if called directly
|
| 264 |
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
| 265 |
+
main();
|
| 266 |
+
}
|