Skip to content

Commit

Permalink
Add new plugin to handle pdf files (#8817)
Browse files Browse the repository at this point in the history
Adds support for PDF as a content source.

Related to #8711
  • Loading branch information
AlexMunoz authored and DSchau committed Oct 9, 2018
1 parent f008878 commit aa6b506
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 0 deletions.
5 changes: 5 additions & 0 deletions packages/gatsby-transformer-pdf/.babelrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"presets": [
["../../.babel-preset.js"]
]
}
2 changes: 2 additions & 0 deletions packages/gatsby-transformer-pdf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/*.js
!index.js
34 changes: 34 additions & 0 deletions packages/gatsby-transformer-pdf/.npmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Logs
logs
*.log

# Runtime data
pids
*.pid
*.seed

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directory
# https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git
node_modules
*.un~
yarn.lock
src
flow-typed
coverage
decls
examples
58 changes: 58 additions & 0 deletions packages/gatsby-transformer-pdf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# gatsby-transformer-pdf

Uses [pdf2json](https://github.com/modesty/pdf2json) to extract the textual content of PDF files.

## Install

`npm install --save gatsby-transformer-pdf`

You also need to have gatsby-source-filesystem installed and configured so it points to your files.

## How to use

```javascript
// In your gatsby-config.js
module.exports = {
plugins: [
{
resolve: `gatsby-source-filesystem`,
options: {
name: `data`,
path: `${__dirname}/src/data/`,
},
},
`gatsby-transformer-pdf`,
],
}
```

Then you'll be able to query the textual content of your PDF files like this:

```graphql
{
allPdf {
edges {
node {
content
}
}
}
}
```
Which would return:

```json
{
"data": {
"allPdf": {
"edges": [
{
"node": {
"content": "1 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed vel purus id tortor \r\neleifend vulputate. Integer interdum ultricies ligula, nec mattis lorem viverra ac. \r\n"
}
}
]
}
}
}
```
1 change: 1 addition & 0 deletions packages/gatsby-transformer-pdf/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// noop
36 changes: 36 additions & 0 deletions packages/gatsby-transformer-pdf/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"name": "gatsby-transformer-pdf",
"description": "Gatsby transformer plugin for pdf",
"version": "1.0.9",
"author": "Alex Munoz <almunozdo@gmail.com>",
"bugs": {
"url": "https://github.com/gatsbyjs/gatsby/issues"
},
"dependencies": {
"@babel/runtime": "^7.0.0",
"bluebird": "^3.5.2",
"pdf2json": "^1.1.7"
},
"devDependencies": {
"@babel/cli": "^7.0.0",
"@babel/core": "^7.0.0",
"cross-env": "^5.1.4"
},
"homepage": "https://github.com/gatsbyjs/gatsby/tree/master/packages/gatsby-transformer-pdf#readme",
"keywords": [
"gatsby",
"gatsby-plugin",
"pdf"
],
"license": "MIT",
"main": "index.js",
"peerDependencies": {
"gatsby": ">=2.0.0"
},
"repository": "https://github.com/gatsbyjs/gatsby/tree/master/packages/gatsby-transformer-pdf",
"scripts": {
"build": "babel src --out-dir . --ignore **/__tests__",
"prepare": "cross-env NODE_ENV=production npm run build",
"watch": "babel -w src --out-dir . --ignore **/__tests__"
}
}
49 changes: 49 additions & 0 deletions packages/gatsby-transformer-pdf/src/gatsby-node.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const Promise = require(`bluebird`)
const PDFParser = require(`pdf2json`)

/**
 * Extract the raw text content of a PDF file with pdf2json.
 *
 * @param {string} path - Path of the PDF file handed to `loadPDF`.
 * @returns {Promise<string>} Resolves with the value of
 *   `pdfParser.getRawTextContent()` once the parser reports its data is ready.
 *   Rejects with an `Error` whose message starts with
 *   `PDF to JSON conversion failed!` and names the file; the payload of the
 *   parser's error event is kept on `error.parserError`.
 */
const convertToJson = path =>
  new Promise((resolve, reject) => {
    // NOTE(review): the `(this, 1)` arguments mirror the pdf2json README; the
    // second one presumably switches on raw-text extraction — confirm upstream.
    const pdfParser = new PDFParser(this, 1)

    // Listeners are attached before the load starts so that no event can be
    // emitted while nobody is listening.
    pdfParser
      .on(`pdfParser_dataReady`, () => {
        resolve(pdfParser.getRawTextContent())
      })
      .on(`pdfParser_dataError`, errData => {
        // Presumably pdf2json wraps the failure as `{ parserError }`; fall back
        // to the raw payload when it does not.
        const parserError =
          errData && errData.parserError ? errData.parserError : errData
        const reason =
          parserError instanceof Error ? parserError.message : parserError

        let message = `PDF to JSON conversion failed! (${path})`
        if (reason) {
          message += `: ${reason}`
        }

        // Reject with a real Error (not a string) so callers get a stack trace
        // and can still inspect what the parser reported.
        const error = new Error(message)
        error.parserError = parserError
        reject(error)
      })

    pdfParser.loadPDF(path)
  })

/**
 * Create a `pdf` child node holding the extracted text of a PDF file node.
 *
 * Nodes whose `extension` is not exactly `pdf` are ignored. For a PDF node the
 * text is read via `convertToJson(node.absolutePath)`, stored on the new
 * node's `content` field, and the new node is linked to `node` as its child.
 *
 * @param {object} api - Arguments passed to the hook.
 * @param {object} api.node - Node being created; `id`, `extension` and
 *   `absolutePath` are read.
 * @param {object} api.actions - Supplies `createNode` and
 *   `createParentChildLink`.
 * @param {Function} api.loadNodeContent - Accepted but not used here.
 * @param {Function} api.createNodeId - Produces the id of the new node.
 * @param {Function} api.createContentDigest - Produces the digest stored on
 *   `internal.contentDigest`.
 * @returns {Promise<void>}
 */
async function onCreateNode({
  node,
  actions,
  loadNodeContent,
  createNodeId,
  createContentDigest,
}) {
  const { createNode, createParentChildLink } = actions

  // Anything that is not a pdf file is left untouched.
  const isPdf = node.extension === `pdf`
  if (!isPdf) {
    return
  }

  const content = await convertToJson(node.absolutePath)

  // `content` is deliberately the last key and `contentDigest` is attached
  // afterwards, so the object handed to createContentDigest is unchanged.
  const pdfNode = {
    id: createNodeId(`${node.id} >>> ${node.extension}`),
    children: [],
    parent: node.id,
    internal: { type: `pdf` },
    content,
  }
  pdfNode.internal.contentDigest = createContentDigest(pdfNode)

  createNode(pdfNode)
  createParentChildLink({ parent: node, child: pdfNode })
}

// Expose the handler as this module's `onCreateNode` export.
exports.onCreateNode = onCreateNode

0 comments on commit aa6b506

Please sign in to comment.