Skip to content

Instantly share code, notes, and snippets.

@bsdelf
Created August 17, 2014 13:38
Show Gist options
  • Save bsdelf/3e9ed789c433ac2f8802 to your computer and use it in GitHub Desktop.
Save bsdelf/3e9ed789c433ac2f8802 to your computer and use it in GitHub Desktop.
remove watermark for "pdf.th7.cn/down/files/1407/Real%20World%20OCaml.pdf"
/*
* clang++ -I/usr/local/include -pipe -std=c++11 -stdlib=libc++ -o main.cc.o -c main.cc
* clang++ -L/usr/local/lib -lpodofo -stdlib=libc++ -o b.out main.cc.o
*
*/
#include <iostream>
#include <string>
#include <list>
using namespace std;
#include <podofo/podofo.h>
// Debug helper: dumps the result of every PdfObject type predicate for `obj`
// to stdout, one 0/1 flag per line, in the fixed order
// Bool, Number, Real, String, HexString, Name, Array, Dictionary, RawData,
// Null, Reference.
// Nothing is printed when the object reports IsNull().
void ShowObjType(const PoDoFo::PdfObject* obj) {
    if (obj->IsNull()) {
        return;
    }

    // Braced initialisation evaluates the predicates left to right, so the
    // query order matches the print order below.
    const bool flags[] = {
        obj->IsBool(),
        obj->IsNumber(),
        obj->IsReal(),
        obj->IsString(),
        obj->IsHexString(),
        obj->IsName(),
        obj->IsArray(),
        obj->IsDictionary(),
        obj->IsRawData(),
        obj->IsNull(),  // always 0 here because of the guard above
        obj->IsReference(),
    };

    for (const bool flag : flags) {
        std::cout << flag << std::endl;
    }
}
int main() {
PoDoFo::PdfVecObjects objs;
PoDoFo::PdfParser parser(&objs);
parser.ParseFile("/tmp/rwo.pdf", false);
int idx = 0;
list<int> badidx;
for (const auto obj: objs) {
bool bdel = false;
// annotate
if (obj->IsDictionary()) {
const auto& d0 = obj->GetDictionary();
const auto& k0 = d0.GetKeys();
if (k0.size() == 2 &&
d0.HasKey(PoDoFo::PdfName("Length")) &&
d0.HasKey(PoDoFo::PdfName("LC"))) {
const auto stream = (PoDoFo::PdfMemStream*)obj->GetStream();
if (stream->GetLength() > 0 &&
string(stream->Get()).find("www.it-ebooks.info") != string::npos) {
bdel = true;
}
}
}
// URI link
if (obj->IsDictionary()) {
const auto& d0 = obj->GetDictionary();
if (d0.HasKey(PoDoFo::PdfName("A"))) {
const auto& k0 = d0.GetKey(PoDoFo::PdfName("A"));
if (k0->IsDictionary()) {
const auto& d1 = k0->GetDictionary();
if (d1.HasKey(PoDoFo::PdfName("URI"))) {
auto str = d1.GetKey(PoDoFo::PdfName("URI"))->GetString().GetString();
if (str == string("http://www.it-ebooks.info/")) {
bdel = true;
}
}
}
}
}
if (bdel) {
badidx.insert(badidx.begin(), idx);
}
idx++;
}
cout << "obj count:" << badidx.size() << endl;
for (auto offset: badidx) {
auto obj = objs.begin() + offset;
delete objs.RemoveObject(obj);
}
// dangle refernce
for (const auto obj: objs) {
auto prune_array = [&badidx](PoDoFo::PdfArray& arr) {
for (int i = arr.size()-1; i >= 0; --i) {
const auto& item = arr[i];
if (item.IsReference()) {
bool orphan = false;
const auto& num = item.GetReference().ObjectNumber();
for (auto offset: badidx) {
if (offset+1 == num) {
orphan = true;
break;
}
}
if (orphan) {
arr.erase(arr.begin()+i);
cout << "erased" << endl;
}
}
}
};
if (obj->IsArray()) {
auto& arr = obj->GetArray();
prune_array(arr);
}
if (obj->IsDictionary()) {
auto& d0 = obj->GetDictionary();
const auto& k0 = d0.GetKeys();
auto prune_key = [&](const string& key) {
if (d0.HasKey(PoDoFo::PdfName(key))) {
auto k1 = d0.GetKey(PoDoFo::PdfName(key));
if (k1->IsArray()) {
auto& arr = k1->GetArray();
prune_array(arr);
if (arr.empty()) {
d0.RemoveKey(PoDoFo::PdfName(key));
cout << "removed" << endl;
}
}
}
};
prune_key("Annots");
prune_key("Contents");
}
}
objs.Finish();
// save
{
auto ptrailer = parser.GetTrailer();
PoDoFo::PdfWriter writer(&objs, ptrailer);
writer.SetPdfVersion(PoDoFo::ePdfVersion_1_6);
writer.Write("out.pdf");
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment