-
Notifications
You must be signed in to change notification settings - Fork 10.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new plugin to handle pdf files (#8817)
Adds support for PDF as a content source. Related to #8711
- Loading branch information
Showing
7 changed files
with
185 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"presets": [ | ||
["../../.babel-preset.js"] | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
/*.js | ||
!index.js |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Logs | ||
logs | ||
*.log | ||
|
||
# Runtime data | ||
pids | ||
*.pid | ||
*.seed | ||
|
||
# Directory for instrumented libs generated by jscoverage/JSCover | ||
lib-cov | ||
|
||
# Coverage directory used by tools like istanbul | ||
coverage | ||
|
||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) | ||
.grunt | ||
|
||
# node-waf configuration | ||
.lock-wscript | ||
|
||
# Compiled binary addons (http://nodejs.org/api/addons.html) | ||
build/Release | ||
|
||
# Dependency directory | ||
# https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git | ||
node_modules | ||
*.un~ | ||
yarn.lock | ||
src | ||
flow-typed | ||
coverage | ||
decls | ||
examples |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# gatsby-transformer-pdf | ||
|
||
Use [pdf2json](https://github.com/modesty/pdf2json) to extract the textual content of PDF files. | ||
|
||
## Install | ||
|
||
`npm install --save gatsby-transformer-pdf` | ||
|
||
You also need to have gatsby-source-filesystem installed and configured so it points to your files. | ||
|
||
## How to use | ||
|
||
```javascript | ||
// In your gatsby-config.js | ||
module.exports = { | ||
plugins: [ | ||
{ | ||
resolve: `gatsby-source-filesystem`, | ||
options: { | ||
name: `data`, | ||
path: `${__dirname}/src/data/`, | ||
}, | ||
}, | ||
`gatsby-transformer-pdf`, | ||
], | ||
} | ||
``` | ||
|
||
Then you'll be able to query the textual content of your PDF files like this: | ||
|
||
```graphql | ||
{ | ||
allPdf { | ||
edges { | ||
node { | ||
content | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
Which would return: | ||
|
||
```json | ||
{ | ||
"data": { | ||
"allPdf": { | ||
"edges": [ | ||
{ | ||
"node": { | ||
"content": "1 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed vel purus id tortor \r\neleifend vulputate. Integer interdum ultricies ligula, nec mattis lorem viverra ac. \r\n" | ||
} | ||
} | ||
] | ||
} | ||
} | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
// noop |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"name": "gatsby-transformer-pdf", | ||
"description": "Gatsby transformer plugin for pdf", | ||
"version": "1.0.9", | ||
"author": "Alex Munoz <almunozdo@gmail.com>", | ||
"bugs": { | ||
"url": "https://github.com/gatsbyjs/gatsby/issues" | ||
}, | ||
"dependencies": { | ||
"@babel/runtime": "^7.0.0", | ||
"bluebird": "^3.5.2", | ||
"pdf2json": "^1.1.7" | ||
}, | ||
"devDependencies": { | ||
"@babel/cli": "^7.0.0", | ||
"@babel/core": "^7.0.0", | ||
"cross-env": "^5.1.4" | ||
}, | ||
"homepage": "https://github.com/gatsbyjs/gatsby/tree/master/packages/gatsby-transformer-pdf#readme", | ||
"keywords": [ | ||
"gatsby", | ||
"gatsby-plugin", | ||
"pdf" | ||
], | ||
"license": "MIT", | ||
"main": "index.js", | ||
"peerDependencies": { | ||
"gatsby": ">=2.0.0" | ||
}, | ||
"repository": "https://github.com/gatsbyjs/gatsby/tree/master/packages/gatsby-transformer-pdf", | ||
"scripts": { | ||
"build": "babel src --out-dir . --ignore **/__tests__", | ||
"prepare": "cross-env NODE_ENV=production npm run build", | ||
"watch": "babel -w src --out-dir . --ignore **/__tests__" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
const Promise = require(`bluebird`) | ||
const PDFParser = require(`pdf2json`) | ||
|
||
/**
 * Extract the raw text content of a PDF file with pdf2json.
 *
 * @param {string} path - Location of the PDF file, handed to `loadPDF`.
 * @returns {Promise<string>} Resolves with `getRawTextContent()` once the
 *   parser emits `pdfParser_dataReady`.
 * @throws {Error} The promise rejects with an `Error` whose message is
 *   `PDF to JSON conversion failed!` when the parser emits
 *   `pdfParser_dataError`; the parser's error payload is attached as `cause`.
 */
const convertToJson = path =>
  new Promise((resolve, reject) => {
    // Constructor arguments are kept exactly as before; the `1` flag follows
    // pdf2json's documented usage for raw text output (see its README).
    const pdfParser = new PDFParser(this, 1)

    // Attach both listeners before starting the load so no event can be missed.
    pdfParser
      .on(`pdfParser_dataReady`, () => {
        resolve(pdfParser.getRawTextContent())
      })
      .on(`pdfParser_dataError`, errData => {
        // Reject with a real Error (stack trace, `instanceof` checks) and keep
        // the parser's payload so the root cause is not lost.
        const error = new Error(`PDF to JSON conversion failed!`)
        error.cause = errData
        reject(error)
      })

    pdfParser.loadPDF(path)
  })
|
||
/**
 * Create a `pdf` child node carrying the text content of a PDF file node.
 *
 * Nodes whose `extension` is not `pdf` are ignored.
 *
 * @param {object} api - Arguments supplied by the caller.
 * @param {object} api.node - Node being processed; its `extension`,
 *   `absolutePath` and `id` are read.
 * @param {object} api.actions - Supplies `createNode` and
 *   `createParentChildLink`.
 * @param {Function} api.createNodeId - Turns a seed string into a node id.
 * @param {Function} api.createContentDigest - Produces the value stored in
 *   `internal.contentDigest`.
 * @returns {Promise<void>} Settles after the node and its parent link have
 *   been submitted; rejects if the PDF conversion fails.
 */
async function onCreateNode({
  node,
  actions,
  createNodeId,
  createContentDigest,
}) {
  const { createNode, createParentChildLink } = actions

  // Filter out non-pdf content
  if (node.extension !== `pdf`) {
    return
  }

  const content = await convertToJson(node.absolutePath)

  // Key order is kept (id, children, parent, internal, content) because the
  // whole object is handed to createContentDigest below.
  const pdfNode = {
    id: createNodeId(`${node.id} >>> ${node.extension}`),
    children: [],
    parent: node.id,
    internal: {
      type: `pdf`,
    },
    content,
  }

  // The digest is computed before `contentDigest` itself exists on the node.
  pdfNode.internal.contentDigest = createContentDigest(pdfNode)

  createNode(pdfNode)
  createParentChildLink({ parent: node, child: pdfNode })
}
|
||
// Expose the handler as this module's `onCreateNode` export.
exports.onCreateNode = onCreateNode