Skip to content

Instantly share code, notes, and snippets.

@madhavpalshikar
Created September 27, 2020 12:45
Show Gist options
  • Save madhavpalshikar/96e72889c534443caefd89000b2e69b5 to your computer and use it in GitHub Desktop.
Save madhavpalshikar/96e72889c534443caefd89000b2e69b5 to your computer and use it in GitHub Desktop.
Converting Office Docs to PDF with AWS Lambda
const https = require('https');
const path = require('path');
const fs = require('fs');
var async = require('async');
const {writeFileSync} = require('fs');
const lambdafs = require('lambdafs');
const {execSync} = require('child_process');
var AWS = require('aws-sdk');
// Location of the Brotli-compressed LibreOffice archive that is passed to lambdafs.inflate().
const inputPath = path.join('/opt', 'lo.tar.br');
// Local scratch directory path (note the trailing slash).
const outputPath = '/tmp/';
// Bucket that holds the source documents and receives the generated PDFs.
// Set DOC_CONVERSION_BUCKET to use another bucket without editing the code;
// the original hard-coded name remains the default.
const bucketName = process.env.DOC_CONVERSION_BUCKET || 'doc-conversion-test';
/**
 * Wraps a value in single quotes so the shell treats it as one literal argument.
 * Embedded single quotes are closed, escaped and reopened ('\'').
 *
 * @param {string} value - Text to pass to the shell.
 * @returns {string} The quoted text.
 */
function shellQuote(value) {
  return "'" + String(value).replace(/'/g, "'\\''") + "'";
}

/**
 * Logs an `ls -alh` listing of a directory; failures are logged, never thrown,
 * because the listing is purely diagnostic.
 *
 * @param {string} directory - Directory to list.
 */
function logDirectoryListing(directory) {
  try {
    console.log(execSync(`ls -alh ${shellQuote(directory)}`).toString('utf8'));
  } catch (error) {
    console.log(error);
  }
}

/**
 * Reads the name of the document to convert from the first record of the event.
 * Accepts an S3 notification (`Records[0].s3.object.key`) as well as a record
 * whose `body` is the object key itself (the shape the handler originally read).
 *
 * @param {object} event - Lambda invocation event.
 * @returns {string} The S3 object key of the source document.
 * @throws {Error} When the event has no records or the record carries no key.
 */
function resolveSourceKey(event) {
  const records = event && event.Records;
  if (!Array.isArray(records) || records.length === 0) {
    throw new Error('Event has no Records; expected an S3 notification or a record with a body');
  }
  const record = records[0];
  if (record.s3 && record.s3.object && record.s3.object.key) {
    // S3 notifications URL-encode the key and represent spaces as "+".
    return decodeURIComponent(record.s3.object.key.replace(/\+/g, ' '));
  }
  if (typeof record.body === 'string' && record.body.length > 0) {
    return record.body;
  }
  throw new Error('First event record contains neither s3.object.key nor a body');
}

/**
 * Lambda entry point: downloads an office document from `bucketName`, converts it
 * to PDF with the LibreOffice build unpacked from `inputPath`, and uploads the
 * result to the same bucket under the `pdf/` prefix.
 *
 * @param {object} event - Invocation event; see resolveSourceKey for accepted shapes.
 * @param {object} context - Lambda context (unused).
 * @returns {Promise<void>} Resolves once the PDF is uploaded (or the key was skipped).
 * @throws {Error} When the event is malformed, or download, conversion or upload fails.
 */
module.exports.handler = async (event, context) => {
  const workDir = '/tmp';
  const outputPrefix = 'pdf/';

  logDirectoryListing('/opt');
  try {
    // Decompressing
    const decompressed = {
      file: await lambdafs.inflate(inputPath)
    };
    console.log('output brotli de:----', decompressed);
  } catch (error) {
    // Best effort: if LibreOffice is really unavailable the conversion below fails loudly.
    console.log('Error brotli de:----', error);
  }
  logDirectoryListing('/opt');

  const s3fileName = resolveSourceKey(event);
  console.log('s3 bucket file name from event:', s3fileName);

  // The PDFs are written back to the same bucket, so an unfiltered bucket trigger
  // would otherwise invoke this function again for its own output.
  if (s3fileName.startsWith(outputPrefix)) {
    console.log('skipping object that is already under the output prefix:', s3fileName);
    return;
  }

  // Only the base name is used locally so that keys containing "/" do not require
  // sub-directories under /tmp; the key's directory part is kept for the upload.
  const parsedKey = path.parse(s3fileName);
  const pdfName = `${parsedKey.name}.pdf`;
  const localInput = path.join(workDir, parsedKey.base);
  const localPdf = path.join(workDir, pdfName);
  const pdfKey = outputPrefix + (parsedKey.dir ? `${parsedKey.dir}/${pdfName}` : pdfName);

  const s3 = new AWS.S3({apiVersion: '2006-03-01'});

  // get file from s3 bucket
  const fileData = await s3.getObject({ Bucket: bucketName, Key: s3fileName }).promise();

  try {
    try {
      fs.writeFileSync(localInput, fileData.Body);
    } catch (err) {
      console.error('file write:', err);
      // Without the input file the conversion cannot succeed, so stop here.
      throw err;
    }

    // The file name is quoted because object keys may contain spaces or shell metacharacters.
    const convertCommand = `export HOME=/tmp && /tmp/lo/instdir/program/soffice.bin --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to "pdf:writer_pdf_Export" --outdir /tmp ${shellQuote(localInput)}`;
    try {
      console.log(execSync(convertCommand).toString('utf8'));
    } catch (e) {
      // The original code retries exactly once on failure; presumably the first
      // soffice run in a fresh HOME can fail while it sets itself up (TODO confirm).
      console.log('first conversion attempt failed, retrying:', e.message);
      console.log(execSync(convertCommand).toString('utf8'));
    }
    logDirectoryListing(workDir);

    const pdfBuffer = fs.readFileSync(localPdf);
    await s3.putObject({
      Body: pdfBuffer,
      Key: pdfKey,
      Bucket: bucketName,
      ContentType: 'application/pdf',
    }).promise();
    console.log('new pdf converted and uploaded!!!');
  } finally {
    // /tmp can survive between warm invocations, so remove this run's files.
    for (const file of new Set([localInput, localPdf])) {
      try {
        fs.unlinkSync(file);
      } catch (cleanupError) {
        if (cleanupError.code !== 'ENOENT') {
          console.log('cleanup failed for', file, cleanupError);
        }
      }
    }
  }
};
@jsteigerwalt-ss
Copy link

I have tried a few different AWS Lambda OpenOffice PDF generators over the past weeks and this is the only one that has worked with current Amazon Linux 2/Lambda constraints for a wide variety of files.

@madhavpalshikar
Copy link
Author

Yes, it works fine for about 99% of files.

@Smita71084
Copy link

I used this code to convert from docx to HTML -- it worked perfectly with a few tweaks.
I also need it to convert to PDF -- but the PDF that gets generated is empty, and I'm not sure what's wrong.

import {execSync} from "child_process";

export class WordToHtmlConvertor {
	/**
	 * Downloads a document from S3, converts it with the LibreOffice Lambda layer and
	 * uploads the converted file to the same bucket, next to the source object.
	 *
	 * @param {String} bucketName - Bucket that holds the source document and receives the result.
	 * @param {String} key - Object key of the uploaded document.
	 * @param {'pdf' | 'html'} format - Target format; defaults to 'pdf'.
	 * @returns {Promise<void>} Resolves once the converted file is uploaded.
	 * @throws {Error} When the format is unsupported, or download, conversion or upload fails.
	 */
	static convertWordToHtml = async (bucketName: String, key: String, format: 'pdf' | 'html' = 'pdf') => {
		const fs = require("fs");
		const path = require("path");
		const AWS = require("aws-sdk");
		const lambdafs = require("lambdafs");

		// LibreOffice export filter and S3 content type for each supported target.
		const targets = {
			pdf: { filter: "pdf:writer_pdf_Export", contentType: "application/pdf" },
			html: { filter: "html:HTML", contentType: "text/html" },
		};
		const target = targets[format];
		if (!target) {
			throw new Error(`Unsupported target format: ${format}`);
		}

		const sourceKey = key.toString();
		const parsedKey = path.parse(sourceKey);
		const outputName = `${parsedKey.name}.${format}`;
		// Only the base name is used locally so keys containing "/" need no sub-directories in /tmp.
		const localInput = path.join("/tmp", parsedKey.base);
		const localOutput = path.join("/tmp", outputName);
		const newFile = parsedKey.dir ? `${parsedKey.dir}/${outputName}` : outputName;

		const s3 = new AWS.S3();

		// Use the promise form: awaiting the callback form does not wait for the
		// download, so the conversion could start before the file is written.
		const source = await s3.getObject({ Bucket: bucketName, Key: sourceKey }).promise();
		fs.writeFileSync(localInput, source.Body);

		try {
			const inputPath = path.join('/opt', 'lo.tar.br');

			// Decompressing
			const decompressed = {
				file: await lambdafs.inflate(inputPath)
			};
			console.log('Libre Office Brotli layer decompressed', decompressed);
		} catch (error) {
			// Best effort: if LibreOffice is really unavailable the conversion below fails loudly.
			console.log('Error decompressing', error);
		}

		console.log(newFile);

		// Convert the file that was just downloaded; it is single-quoted because
		// object keys may contain spaces or shell metacharacters.
		const quotedInput = "'" + localInput.replace(/'/g, "'\\''") + "'";
		const convertCommand = `export HOME=/tmp && /tmp/lo/instdir/program/soffice.bin --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to "${target.filter}" --outdir /tmp ${quotedInput}`;
		try {
			console.log(execSync(convertCommand).toString('utf8'));
		} catch (e) {
			// The original code retries exactly once when the first run fails.
			console.log(execSync(convertCommand).toString('utf8'));
		}

		// Read raw bytes. Reading with {encoding: 'binary'} yields a latin1 string that
		// is re-encoded as UTF-8 on upload, which corrupts binary output such as PDF.
		const fileData = fs.readFileSync(localOutput);
		console.log(`Converted ${localInput} to ${localOutput} (${fileData.length} bytes)`);

		try {
			const result = await s3.putObject({
				Bucket: bucketName,
				Key: newFile,
				Body: fileData,
				ContentType: target.contentType,
			}).promise();
			console.log("Uploaded to s3:", result);
		} catch (err) {
			console.error("Error uploading file to S3:", err.toString());
			// Propagate so a failed upload is not reported as a success.
			throw err;
		}

		console.log('New Document converted and uploaded');
	}
};

@Smita71084
Copy link

can you please help ?

@aherman3
Copy link

aherman3 commented Jul 7, 2021

this worked with some editing:

  • had to give s3 bucket read and write access for my lambda function in IAM
  • "body = event.Records[0].body;" returns undefined -> I just manually changed it to my filename (ex. file.docx)
    the pdf looks great, thank you!

@diveshqss
Copy link

working with a little modification for amplify
https://gist.github.com/diveshqss/1fc4a81a47db2e24f1401e50984dc075

@priyanshikathuria
Copy link

I have been using this blog post (https://madhavpalshikar.medium.com/converting-office-docs-to-pdf-with-aws-lambda-372c5ac918f1) for my conversion project. I have followed all the steps up to step 6 (uploading the code), but I don't know how to test it now: I'm not able to upload a document and then see whether it gets converted. I'm a novice with AWS and I'm experimenting, so if you could help me with that, I would be beyond grateful. Please reach out to me at priyanshikathuria@gmail.com

@vpatidar009
Copy link

Thanks @madhavpalshikar for sharing this. You just saved my day.

Kudos!!

@bavingtonm
Copy link

Hi

I have tried to follow the instructions on your post but every time I test the Lambda Function I get the below error from Lambda:

{
"errorType": "TypeError",
"errorMessage": "Cannot read properties of undefined (reading '0')",
"trace": [
"TypeError: Cannot read properties of undefined (reading '0')",
" at Runtime.module.exports.handler (/var/task/index/index.js:36:26)",
" at runMicrotasks ()",
" at processTicksAndRejections (node:internal/process/task_queues:96:5)"
]
}

I'm pretty sure it is causing a loop @ line 36

body = event.Records[0].body;

Does anyone have any ideas how to fix this issue please?

Thanks

M

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment