322 lines
8.9 KiB
C++
322 lines
8.9 KiB
C++
/* Boost.Flyweight example of flyweight-based formatted text processing.
|
|
*
|
|
* Copyright 2006-2014 Joaquin M Lopez Munoz.
|
|
* Distributed under the Boost Software License, Version 1.0.
|
|
* (See accompanying file LICENSE_1_0.txt or copy at
|
|
* http://www.boost.org/LICENSE_1_0.txt)
|
|
*
|
|
* See http://www.boost.org/libs/flyweight for library home page.
|
|
*/
|
|
|
|
#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
|
|
|
|
#if defined(BOOST_NO_STDC_NAMESPACE)
|
|
/* Make ::exit and ::tolower reachable as std::exit and std::tolower when
 * BOOST_NO_STDC_NAMESPACE is defined.
 */
namespace std{using ::exit;using ::tolower;}
|
|
#endif
|
|
|
|
/* flyweight is referred to unqualified in the typedefs below */
using namespace boost::flyweights;
|
|
|
|
/* An HTML tag consists of a name and optional properties of the form
|
|
* name1=value1 ... namen=valuen. We do not need to parse the properties
|
|
* for the purposes of the program, hence they are all stored in
|
|
* html_tag_data::properties in raw form.
|
|
*/
|
|
|
|
struct html_tag_data
{
  std::string name;       /* tag name */
  std::string properties; /* property text of the tag, kept in raw form */
};
|
|
|
|
bool operator==(const html_tag_data& x,const html_tag_data& y)
|
|
{
|
|
return x.name==y.name&&x.properties==y.properties;
|
|
}
|
|
|
|
/* See the portability section of Boost.Hash at
|
|
* http://boost.org/doc/html/hash/portability.html
|
|
* for an explanation of the ADL-related workarounds.
|
|
*/
|
|
|
|
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
|
|
namespace boost{
|
|
#endif
|
|
|
|
std::size_t hash_value(const html_tag_data& x)
|
|
{
|
|
std::size_t res=0;
|
|
boost::hash_combine(res,x.name);
|
|
boost::hash_combine(res,x.properties);
|
|
return res;
|
|
}
|
|
|
|
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
|
|
} /* namespace boost */
|
|
#endif
|
|
|
|
/* html_tag: flyweight over html_tag_data (operator== and hash_value for
 * html_tag_data are defined above).
 */
typedef flyweight<html_tag_data> html_tag;
|
|
|
|
/* parse_tag is passed an iterator positioned at the first char of
|
|
* the tag after the opening '<' and returns, if successful, a parsed tag
|
|
* and whether it is opening (<xx>) or closing (</xx>).
|
|
*/
|
|
|
|
/* Outcome of parse_tag: an opening tag, a closing tag, or no tag parsed. */
enum tag_type{opening,closing,failure};
|
|
|
|
struct parse_tag_res
|
|
{
|
|
parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
|
|
type(type_),tag(tag_){}
|
|
parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
|
|
|
|
tag_type type;
|
|
html_tag tag;
|
|
};
|
|
|
|
template<typename ForwardIterator>
|
|
parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
|
|
{
|
|
html_tag_data tag;
|
|
std::string buf;
|
|
bool in_quote=false;
|
|
for(ForwardIterator it=first;it!=last;){
|
|
char ch=*it++;
|
|
if(ch=='>'&&!in_quote){ /* ignore '>'s if inside quotes */
|
|
tag_type type;
|
|
std::string::size_type
|
|
bname=buf.find_first_not_of("\t\n\r "),
|
|
ename=bname==std::string::npos?
|
|
std::string::npos:
|
|
buf.find_first_of("\t\n\r ",bname),
|
|
bprop=ename==std::string::npos?
|
|
std::string::npos:
|
|
buf.find_first_not_of("\t\n\r ",ename);
|
|
if(bname==ename){ /* null name */
|
|
return parse_tag_res(failure);
|
|
}
|
|
else if(buf[bname]=='/'){ /* closing tag */
|
|
type=closing;
|
|
++bname;
|
|
}
|
|
else type=opening;
|
|
tag.name=buf.substr(bname,ename-bname);
|
|
std::transform( /* normalize tag name to lower case */
|
|
tag.name.begin(),tag.name.end(),tag.name.begin(),
|
|
(int(*)(int))std::tolower);
|
|
if(bprop!=std::string::npos){
|
|
tag.properties=buf.substr(bprop,buf.size());
|
|
}
|
|
first=it; /* result good, consume the chars */
|
|
return parse_tag_res(type,tag);
|
|
}
|
|
else{
|
|
if(ch=='"')in_quote=!in_quote;
|
|
buf+=ch;
|
|
}
|
|
}
|
|
return parse_tag_res(failure); /* end reached and found no '>' */
|
|
}
|
|
|
|
/* A character context is just a vector containing the tags enclosing the
|
|
* character, from the outermost level to the innermost.
|
|
*/
|
|
|
|
/* enclosing tags of a character, outermost first */
typedef std::vector<html_tag> html_context_data;
|
|
/* flyweight over a whole context vector */
typedef flyweight<html_context_data> html_context;
|
|
|
|
/* A character is a char code plus its context.
|
|
*/
|
|
|
|
struct character_data
|
|
{
|
|
character_data(char code_=0,html_context context_=html_context()):
|
|
code(code_),context(context_){}
|
|
character_data(const character_data& x):code(x.code),context(x.context){}
|
|
|
|
char code;
|
|
html_context context;
|
|
};
|
|
|
|
bool operator==(const character_data& x,const character_data& y)
|
|
{
|
|
return x.code==y.code&&x.context==y.context;
|
|
}
|
|
|
|
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
|
|
namespace boost{
|
|
#endif
|
|
|
|
std::size_t hash_value(const character_data& x)
|
|
{
|
|
std::size_t res=0;
|
|
boost::hash_combine(res,x.code);
|
|
boost::hash_combine(res,x.context);
|
|
return res;
|
|
}
|
|
|
|
#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
|
|
} /* namespace boost */
|
|
#endif
|
|
|
|
/* flyweight over a char code plus its context */
typedef flyweight<character_data> character;
|
|
|
|
/* scan_html converts HTML code into a stream of contextualized characters.
|
|
*/
|
|
|
|
template<typename ForwardIterator,typename OutputIterator>
|
|
void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
|
|
{
|
|
html_context_data context;
|
|
while(first!=last){
|
|
if(*first=='<'){ /* tag found */
|
|
++first;
|
|
parse_tag_res res=parse_tag(first,last);
|
|
if(res.type==opening){ /* add to contex */
|
|
context.push_back(res.tag);
|
|
continue;
|
|
}
|
|
else if(res.type==closing){ /* remove from context */
|
|
/* Pop all tags from the innermost to the matching one; this takes
|
|
* care of missing </xx>s like vg. in <ul><li>hello</ul>.
|
|
*/
|
|
|
|
for(html_context_data::reverse_iterator rit=context.rbegin();
|
|
rit!=context.rend();++rit){
|
|
if(rit->get().name==res.tag.get().name){
|
|
context.erase(rit.base()-1,context.end());
|
|
break;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
*out++=character(*first++,html_context(context));
|
|
}
|
|
}
|
|
|
|
/* HTML-producing utilities */
|
|
|
|
void print_opening_tag(std::ostream& os,const html_tag_data& x)
|
|
{
|
|
os<<"<"<<x.name;
|
|
if(!x.properties.empty())os<<" "<<x.properties;
|
|
os<<">";
|
|
}
|
|
|
|
void print_closing_tag(std::ostream& os,const html_tag_data& x)
|
|
{
|
|
/* SGML declarations (beginning with '!') are not closed */
|
|
|
|
if(x.name[0]!='!')os<<"</"<<x.name<<">";
|
|
}
|
|
|
|
/* change_context takes contexts from and to with tags
|
|
*
|
|
* from<- c1 ... cn fn+1 ... fm
|
|
* to <- c1 ... cn tn+1 ... tk
|
|
*
|
|
* (that is, they share the first n tags, n might be 0), and
|
|
* produces code closing fm ... fn+1 and opening tn+1 ... tk.
|
|
*/
|
|
|
|
template<typename OutputIterator>
|
|
void change_context(
|
|
const html_context_data& from,const html_context_data& to,
|
|
OutputIterator out)
|
|
{
|
|
std::ostringstream oss;
|
|
html_context_data::const_iterator
|
|
it0=from.begin(),
|
|
it0_end=from.end(),
|
|
it1=to.begin(),
|
|
it1_end=to.end();
|
|
for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
|
|
while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
|
|
while(it1!=it1_end)print_opening_tag(oss,*it1++);
|
|
std::string str=oss.str();
|
|
std::copy(str.begin(),str.end(),out);
|
|
}
|
|
|
|
/* produce_html is passed a bunch of contextualized characters and emits
|
|
* the corresponding HTML. The algorithm is simple: tags are opened and closed
|
|
* as a result of the context from one character to the following changing.
|
|
*/
|
|
|
|
template<typename ForwardIterator,typename OutputIterator>
|
|
void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
|
|
{
|
|
html_context context;
|
|
while(first!=last){
|
|
if(first->get().context!=context){
|
|
change_context(context,first->get().context,out);
|
|
context=first->get().context;
|
|
}
|
|
*out++=(first++)->get().code;
|
|
}
|
|
change_context(context,html_context(),out); /* close remaining context */
|
|
}
|
|
|
|
/* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
|
|
* find some friend operators in certain contexts.
|
|
*/
|
|
|
|
character dummy1; /* unused object; exists only to instantiate character */
|
|
html_tag dummy2; /* unused object; exists only to instantiate html_tag */
|
|
|
|
int main()
|
|
{
|
|
std::cout<<"input html file: ";
|
|
std::string in;
|
|
std::getline(std::cin,in);
|
|
std::ifstream ifs(in.c_str());
|
|
if(!ifs){
|
|
std::cout<<"can't open "<<in<<std::endl;
|
|
std::exit(EXIT_FAILURE);
|
|
}
|
|
typedef std::istreambuf_iterator<char> istrbuf_iterator;
|
|
std::vector<char> html_source;
|
|
std::copy(
|
|
istrbuf_iterator(ifs),istrbuf_iterator(),
|
|
std::back_inserter(html_source));
|
|
|
|
/* parse the HTML */
|
|
|
|
std::vector<character> scanned_html;
|
|
scan_html(
|
|
html_source.begin(),html_source.end(),std::back_inserter(scanned_html));
|
|
|
|
/* Now that we have the text as a vector of contextualized characters,
|
|
* we can shuffle it around and manipulate in almost any way we please.
|
|
* For instance, the following reverses the central portion of the doc.
|
|
*/
|
|
|
|
std::reverse(
|
|
scanned_html.begin()+scanned_html.size()/4,
|
|
scanned_html.begin()+3*(scanned_html.size()/4));
|
|
|
|
/* emit the resulting HTML */
|
|
|
|
std::cout<<"output html file: ";
|
|
std::string out;
|
|
std::getline(std::cin,out);
|
|
std::ofstream ofs(out.c_str());
|
|
if(!ofs){
|
|
std::cout<<"can't open "<<out<<std::endl;
|
|
std::exit(EXIT_FAILURE);
|
|
}
|
|
typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
|
|
produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));
|
|
|
|
return 0;
|
|
}
|