SILVERCODERS DocToText  4.0.1512
Converts DOC, XLS, XLSB, PPT, RTF, ODF (ODT, ODS, ODP), OOXML (DOCX, XLSX, PPTX), iWork (PAGES, NUMBERS, KEYNOTE), ODFXML (FODP, FODS, FODT), PDF, EML and HTML documents to plain text. Extracts metadata and annotations.
 All Classes Functions Enumerations Pages
plain_text_extractor.h
1 #ifndef DOCTOTEXT_PLAIN_TEXT_EXTRACTOR_H
2 #define DOCTOTEXT_PLAIN_TEXT_EXTRACTOR_H
3 
4 #include "attachment.h"
5 #include "exception.h"
6 #include "formatting_style.h"
7 #include "link.h"
8 #include <string>
9 #include <vector>
10 
11 namespace doctotext
12 {
13  class Metadata;
14 
38  class PlainTextExtractor
39  {
40  private:
41  struct Implementation;
42  Implementation* impl;
43 
44  public:
45 
50  enum ParserType
51  {
52  PARSER_AUTO,
53  PARSER_RTF,
54  PARSER_ODF_OOXML,
55  PARSER_XLS,
56  PARSER_DOC,
57  PARSER_PPT,
58  PARSER_HTML,
59  PARSER_IWORK,
60  PARSER_XLSB,
61  PARSER_PDF,
62  PARSER_TXT,
63  PARSER_EML,
64  PARSER_ODFXML
65  };
66 
72  PlainTextExtractor(ParserType parser_type = PARSER_AUTO);
73 
75 
86  void setVerboseLogging(bool verbose);
87 
95  void setLogStream(std::ostream& log_stream);
96 
103  void setFormattingStyle(const FormattingStyle& style);
104 
105  void setXmlParseMode(XmlParseMode mode);
106 
115  void setManageXmlParser(bool manage);
116 
127  ParserType parserTypeByFileExtension(const std::string& file_name);
128 
132  ParserType parserTypeByFileExtension(const char* file_name);
133 
142  bool parserTypeByFileContent(const std::string& file_name, ParserType& parser_type);
143 
147  bool parserTypeByFileContent(const char* file_name, ParserType& parser_type);
148 
155  bool parserTypeByFileContent(const char* buffer, size_t size, ParserType& parser_type);
156 
164  bool processFile(const std::string& file_name, std::string& text);
165 
171  bool processFile(const char* file_name, char*& text);
172 
179  bool processFile(const char* buffer, size_t size, char*& text);
180 
184  bool processFile(const char* buffer, size_t size, std::string& text);
185 
199  bool processFile(ParserType parser_type, bool fallback, const std::string& file_name, std::string& text);
200 
206  bool processFile(ParserType parser_type, bool fallback, const char* file_name, char*& text);
207 
214  bool processFile(ParserType parser_type, bool fallback, const char* buffer, size_t size, char*& text);
215 
219  bool processFile(ParserType parser_type, bool fallback, const char* buffer, size_t size, std::string& text);
220 
228  bool extractMetadata(const std::string& file_name, Metadata& metadata);
229 
233  bool extractMetadata(const char* file_name, Metadata& metadata);
234 
241  bool extractMetadata(const char* buffer, size_t size, Metadata& metadata);
242 
256  bool extractMetadata(ParserType parser_type, bool fallback, const std::string& file_name, Metadata& metadata);
257 
261  bool extractMetadata(ParserType parser_type, bool fallback, const char* file_name, Metadata& metadata);
262 
269  bool extractMetadata(ParserType parser_type, bool fallback, const char* buffer, size_t size, Metadata& metadata);
270 
276  size_t getNumberOfLinks() const;
277 
282  void getParsedLinks(std::vector<Link>& links) const;
283 
292  void getParsedLinks(const Link*& links, size_t& number_of_links) const;
293 
301  const Link* getParsedLinks() const;
302 
308  void getAttachments(std::vector<Attachment>& attachments) const;
309 
319  void getAttachments(const Attachment*& attachments, size_t& number_of_attachments) const;
320 
329  const Attachment* getAttachments() const;
330 
337  size_t getNumberOfAttachments() const;
338  };
339 }
340 
341 #endif
bool processFile(const std::string &file_name, std::string &text)
PlainTextExtractor
Definition: plain_text_extractor.h:38
PlainTextExtractor(ParserType parser_type=PARSER_AUTO)
Definition: attachment.h:16
void setLogStream(std::ostream &log_stream)
bool parserTypeByFileContent(const std::string &file_name, ParserType &parser_type)
size_t getNumberOfAttachments() const
void setVerboseLogging(bool verbose)
size_t getNumberOfLinks() const
bool extractMetadata(const std::string &file_name, Metadata &metadata)
const Link * getParsedLinks() const
ParserType parserTypeByFileExtension(const std::string &file_name)
const Attachment * getAttachments() const
void setFormattingStyle(const FormattingStyle &style)
Definition: formatting_style.h:28
ParserType
Definition: plain_text_extractor.h:50
Definition: metadata.h:10
void setManageXmlParser(bool manage)