% Conference paper: Bouras, Poulopoulos and Adam, "Image extraction from online text streams" (MAW10, 2010).
% The month field keeps the event days (20--23) and uses the standard apr macro so styles can localise it.
@inproceedings{2963,
  author    = {Bouras, Christos and Poulopoulos, Vassilis and Adam, George},
  title     = {Image extraction from online text streams},
  booktitle = {The 2010 IEEE International Symposium on Mining and Web (MAW10), Perth, Australia},
  year      = {2010},
  month     = "20--23~" # apr,
  pages     = {609--614},
  abstract  = {In this paper we present an efficient system that processes HTML pages in order to extract the useful images from them. The proposed mechanism is template independent and is focalized on HTML pages that include news articles from major portals and blogs. As useful images we define the pictures that are relevant to the news report. In order to extract the image objects of the article we deconstruct the HTML page to its DOM model and we apply a set of algorithms in order to clean and correct the HTML code, locate and characterize each node of the DOM model and finally keep the nodes that are characterized as useful nodes. The proposed mechanism is applied as a subsystem of peRSSonal, a web tool that is used to obtain news articles from all over the world, process them and present them back to the end users in a personalized manner. The role of the mechanism is to feed peRSSonal's database with digital images for browsing and searching purposes. We present the basic algorithms and experimental results on the efficiency of the proposed implementation.},
}