#! /usr/local/bin/ruby
# $Id: rss2nntp.rb 213 2008-12-26 16:00:57Z fred $
#
# Copyright (c) 2006-2007 Frédéric Senault. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. Neither the name of Frédéric Senault or any contributors may be
#    used to endorse or promote products derived from this software
#    without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# RSS2NNTP module, to convert RSS articles to NNTP articles

begin
  require 'rubygems'
rescue LoadError
  # RubyGems is optional: the libraries below may be reachable without it.
end

require 'feed_tools'
require 'uri'
require 'open3'
require 'tempfile'
require 'iconv'
require 'singleton'
require 'socket'
require 'html/htmltokenizer'
require 'htmlentities'

# Lookup table indexed by byte value (0..255): true for LF (10), CR (13)
# and the ASCII range 32..126, false for every other byte.
PRINTABLE = (0..255).map { |c| c == 10 || c == 13 || (32..126).include?(c) }.freeze

# Process-wide configuration (a Singleton).  #parse loads the defaults, then
# overrides them from a "rss2nntp.conf" file and builds the list of feeds.
class RSS2NNTPConf
  include Singleton

  # Base name of the configuration file.
  CONF_NAME = 'rss2nntp.conf'

  # Settings whose textual value ("yes"/"true") is converted to a boolean.
  BOOLEAN_KEYS = [ :usesig, :debug, :verbose, :post ]

  # Loads the configuration.
  #
  # f:: optional path; only its directory part is used, the file read is
  #     always "rss2nntp.conf" in that directory.  When nil, the directory
  #     of the running script ($0) is used.  The argument is not modified.
  #
  # File format, as implemented below:
  # * everything from '#' to the end of a line is dropped;
  # * a line starting with whitespace continues the previous line;
  # * "KEY = value" (KEY made of A-Z and 0-9) sets the setting :key;
  # * any other non-empty entry is a feed line handed to RSSConfFeed.new.
  #
  # Returns self.  Raises whatever File.open raises when the file is missing.
  def parse(f = nil)
    @conf = {
      :path      => 'rss2nntp',
      :hierarchy => 'nonexistent.test.',
      :converter => '/usr/local/bin/lynx -dump -force_html ' +
                    '-assume_charset iso-8859-15 -nolist ',
      :host      => Socket.gethostname,
      :server    => 'localhost',
      :abuse     => 'abuse@' + Socket.gethostname.gsub(/^[^.]*\./, ''),
      :program   => 'Lacave.net RSS2NNTP aggregator',
      :version   => '0.7',
      :usesig    => true,
      :debug     => false,
      :verbose   => false,
      :post      => true,
    }
    @feeds = []
    feed_lines = []

    File.open(conf_path(f)) do |h|
      entry = ''
      h.each_line do |raw|
        line = raw.chomp.sub(/#.*$/, '')
        if line =~ /^\s/
          # Continuation line: glue it to the entry being built.
          entry << line.sub(/^\s+/, ' ')
        else
          store_entry(entry, feed_lines)
          entry = line
        end
      end
      # The last entry has no following line to trigger its storage.
      store_entry(entry, feed_lines)
    end

    BOOLEAN_KEYS.each do |k|
      next unless @conf[k].is_a?(String)
      v = @conf[k].downcase
      @conf[k] = (v == 'yes' || v == 'true')
    end

    # RSSConfFeed#setgroup concatenates hierarchy + group, so the hierarchy
    # must end with a dot.
    @conf[:hierarchy] += '.' unless @conf[:hierarchy][-1, 1] == '.'

    feed_lines.each do |l|
      @feeds << RSSConfFeed.new(l)
    end
    self
  end

  # Sets the setting +key+ to +val+.
  def []=(key, val)
    @conf[key] = val
  end

  # Returns the setting stored under +key+ (nil when unset).
  def [](key)
    @conf[key]
  end

  # Returns the configured feed whose URL equals +url+, or nil.
  def feed(url)
    @feeds.find { |f| f.url == url }
  end

  # Returns the array of configured RSSConfFeed objects.
  def feeds
    @feeds
  end

  private

  # Builds the configuration file path from +f+ (or from $0 when nil).
  # A value ending in '/' is taken as the directory itself; otherwise the
  # last path component is dropped.  A bare file name (no '/') resolves to
  # the current directory instead of being concatenated with the file name.
  def conf_path(f)
    base = f.nil? ? $0 : f
    dir = (base[-1, 1] == '/') ? base : File.dirname(base)
    File.join(dir, CONF_NAME)
  end

  # Stores one logical entry: a "KEY = value" setting goes to @conf, anything
  # else is appended to +feed_lines+.  Entries that are empty once stripped
  # (blank lines, indented comments) are ignored so that they never reach
  # RSSConfFeed.new.
  def store_entry(entry, feed_lines)
    text = entry.strip
    return if text.empty?
    if text =~ /^([A-Z0-9]+)\s*=\s*(.+)$/
      @conf[$1.downcase.to_sym] = $2
    else
      feed_lines << text
    end
  end
end

# One feed line of the configuration file: "<url>" or "<url> <group>".
# A "gzip:" prefix on the URL sets the gzip flag and is removed from the URL.
class RSSConfFeed
  attr_accessor :url, :group, :gzip

  @@conf = nil

  # line:: a stripped, non-empty feed line.
  # Raises ArgumentError when no URL can be extracted from +line+.
  def initialize(line)
    @@conf = RSS2NNTPConf.instance unless @@conf
    case line
    when /^([^\s]+)$/
      @url = $1
      setgroup('')
    when /^(.+?)\s+(.+)$/
      @url = $1
      setgroup($2)
    end
    raise ArgumentError, "invalid feed line: #{line.inspect}" if @url.nil?

    if @url[0..4] == 'gzip:'
      @gzip = true
      @url = @url[5..-1]
    else
      @gzip = false
    end
  end

  # Sets the newsgroup of the feed:
  # * empty string -> '' (RSSArticle#initialize then derives a group);
  # * "/name"      -> "name", used as is;
  # * "name"       -> configured :hierarchy followed by "name".
  def setgroup(group)
    if group.empty?
      @group = ''
    elsif group[0..0] == '/'
      @group = group[1..-1]
    else
      @group = @@conf[:hierarchy] + group
    end
  end

  # True when the URL carried the "gzip:" prefix.
  def gzip?
    @gzip
  end
end

# NNTP article built from one feed item.
#
# Net::NNTP::Article is not required by this file; it has to be loaded before
# this class is defined.  Headers are set through the inherited []= method.
class RSSArticle < Net::NNTP::Article
  @@conf = nil
  @@conv = nil
  @@htmld = nil

  # item::    the feed item (title, content, link, guid, author, categories,
  #           published and time are read from it).
  # feed::    the feed the item belongs to (title and link are read).
  # cfgfeed:: the matching RSSConfFeed; when its group is empty, one is
  #           derived from its URL and stored back with setgroup.
  def initialize(item, feed, cfgfeed)
    super()
    @@conf = RSS2NNTPConf.instance unless @@conf
    @@conv ||= Iconv.new('ISO-8859-15//TRANSLIT', 'UTF-8')
    @@htmld ||= HTMLEntities.new
    @tags = []
    @body = parse_body(item, feed)

    if cfgfeed.group.empty?
      cfgfeed.setgroup(RSSArticle.make_group(cfgfeed.url))
    end

    self['Path'] = @@conf[:path]
    self['Subject'] =
      if item.title.nil? || item.title == ''
        'Pas de sujet'
      else
        @@conv.iconv(@@htmld.decode(item.title))
      end
    self['From'] = @@conv.iconv(@@htmld.decode(make_from(item, feed)))
    self['Newsgroups'] = cfgfeed.group
    self['Message-ID'] = RSSArticle.make_mid(item, cfgfeed.group)
    self['Date'] = make_rss_date(item)
    self['NNTP-Posting-Host'] = @@conf[:host]
    self['NNTP-Posting-Date'] = RSSArticle.make_date(Time.now)
    self['Lines'] = @body.scan(/\n/).length
    self['Mime-Version'] = '1.0'
    self['Content-Type'] = 'text/plain; charset="iso-8859-15"'
    self['Content-Transfer-Encoding'] = '8bit'
    self['X-Trace'] = "#{@@conf[:host]} #{Time.now.to_i} " +
                      "#{Process.pid} 127.0.0.1 (#{Time.now.to_s})"
    self['X-Complaints-To'] = @@conf[:abuse]
    self['User-Agent'] = @@conf[:program] + ' ' + @@conf[:version]
    self['Keywords'] = make_keywords(item.categories)
  end

  # Derives a group name from the host of +url+: the host labels are
  # reversed, the first one (the TLD) is dropped, a following "co" or "uc"
  # label is dropped too, and a trailing "www" label is removed.
  # E.g. "http://www.example.co.uk/rss" gives "example" and
  # "http://blog.example.com/" gives "example.blog".
  #
  # Raises ArgumentError when +url+ has no host part.
  def self.make_group(url)
    host = URI.parse(url).host
    raise ArgumentError, "no host in feed URL: #{url.inspect}" if host.nil?

    labels = host.split(/\./).reverse
    labels.shift
    labels.shift if labels[0] == 'co' || labels[0] == 'uc'
    labels.pop if labels[-1] == 'www'
    labels.join('.')
  end

  # Builds the Message-ID "<id.1@group>" of an item.
  #
  # The id comes from the item's guid (its link when there is no guid):
  # * "http://" URLs: path with '/' turned into '.', some index/extension
  #   parts removed, then query and fragment appended;
  # * anything else: the guid itself, minus a leading "tag:...," prefix.
  # When that yields nothing, the published date, the time, then the title
  # are tried in turn.  Finally ':;,=+' become '.' and every character
  # outside [a-zA-Z0-9._-] is removed.
  #
  # The item's own strings are never modified.
  #
  # NOTE: the patterns below are deliberately left exactly as they were
  # (including the "http://"-only test and the unescaped dot in "index.");
  # altering them would change the Message-ID generated for a given item.
  def self.make_mid(item, group)
    guid = (item.guid ? item.guid : item.link)
    mid = ''
    if guid =~ /^http:\/\//
      u = URI.parse(guid)
      l = u.path.gsub(/\//, '.')
      l.gsub!(/^\./, '')
      l.gsub!(/\.$/, '')
      l.gsub!(/index.[a-z]{2,6}\./, '')
      l.gsub!(/\.(html?|xml)/, '')
      if !u.query.nil? && u.query != ''
        l += (l == '' ? '' : '.') + "#{u.query}"
      end
      if !u.fragment.nil? && u.fragment != ''
        l += (l == '' ? '' : '.') + "#{u.fragment}"
      end
      mid = l
    else
      # to_s covers an item with neither guid nor link; gsub (not gsub!)
      # returns a copy, leaving the item's guid untouched.
      mid = guid.to_s.gsub(/^tag:([^,])+,/, '')
    end

    mid = item.published.to_s if mid.empty? && !item.published.nil?
    mid = item.time.to_s if mid.empty? && !item.time.nil?
    mid = item.title.to_s if mid.empty?

    mid = mid.gsub(/[:;,=+]/, '.').gsub(/[^a-zA-Z0-9._-]/, '')
    "<#{mid}.1@#{group}>"
  end

  # Produces the plain-text body of the article.
  #
  # The item content is converted with the shared Iconv converter (bytes
  # outside PRINTABLE are replaced by '?' when that conversion fails), links
  # are replaced by markers, the result is written to a temporary file and
  # run through the configured :converter command, then the markers are
  # expanded and a footer with the item and feed links is appended.
  def parse_body(item, feed)
    raw = item.content.to_s
    content = begin
      @@conv.iconv(raw)
    rescue StandardError
      nil
    end
    if content.nil?
      # Conversion failed: keep only the bytes flagged in PRINTABLE.
      content = raw.unpack('C*').map { |v|
        PRINTABLE[v] ? [ v ].pack('C*') : '?'
      }.join
    end

    b = ''
    tf = Tempfile.new('rss2news')
    begin
      tf.puts pre_parse_links(content)
      tf.close
      # NOTE(review): the converter's stderr is never read; a converter
      # writing a lot on stderr could block.
      Open3.popen3(@@conf[:converter] + " #{tf.path}") do |i, o, _e|
        i.close_write
        while (l = o.gets)
          b << l.gsub(/^ /, '')
        end
      end
    ensure
      # Remove the temporary file even when the converter call raises.
      tf.close(true)
    end

    b = post_parse_links(b)
    b << "\n" unless b[-1, 1] == "\n"
    b << "-- \n" if @@conf[:usesig]
    b << "Original : #{item.link}\nSite : #{feed.link}\n"
    b
  end

  # Rewrites the HTML in +content+ before conversion: each <a> with an
  # http/mailto/ftp/news/nntp href and each <img> is recorded in @tags and
  # replaced by a "&-N-&" marker (N being its 1-based index in @tags), other
  # anchors by "&-lien-&".  The text of a recorded anchor is dropped when it
  # is identical to its URL.  A non-empty alt attribute is kept as "[alt]".
  # Returns the rewritten HTML.
  def pre_parse_links(content)
    r = ''
    lok = -1          # index (1-based) of the anchor being read, -1 if none
    acc = ''          # text collected inside the current recorded anchor
    tk = HTMLTokenizer.new(content)

    while (t = tk.getNextToken)
      if t.is_a?(HTMLTag) && t.tag_name == 'a'
        href = t.attr_hash['href']
        if href && href =~ /^(http|mailto|ftp|news|nntp)/
          @tags.push(href)
          lok = @tags.length
        else
          lok = -1
        end
      elsif t.is_a?(HTMLTag) && (t.tag_name == '/a' || t.tag_name == 'img')
        if acc != ''
          r << acc if lok == -1 || acc.strip != @tags[lok - 1].strip
          acc = ''
        end
        alt = t.attr_hash['alt']
        if alt && alt != ''
          r << ' ' unless r[-1, 1] == ' '
          r << '[' + alt + ']'
        end
        if t.tag_name == 'img'
          # to_s: an <img> without src must not store nil in @tags.
          @tags.push(t.attr_hash['src'].to_s)
          r << '&-' + @tags.length.to_s
        else
          r << (lok != -1 ? '&-' + lok.to_s : '&-lien')
          lok = -1
        end
        r << '-&'
      elsif lok == -1
        r << t.to_s
      else
        acc << t.to_s
      end
    end

    # Text of an anchor that was never closed would otherwise be lost.
    r << acc unless acc.empty?
    r
  end

  # Expands the "&-...-&" markers left by pre_parse_links into "[...]" and
  # appends one "[N] : url" line per entry of @tags.  Returns +content+
  # unchanged when @tags is empty.
  def post_parse_links(content)
    return content if @tags.length == 0

    r = content.gsub(/( +)?&-((?:lien|img|\d+)(?::.*?)?)-&([ ()'`?"])?/) do |m|
      # Copy the match data first: it is global state.
      lead, label, trail, before = $1, $2, $3, $`
      e = ''
      unless before[-1, 1] == "\n" || before.length == 0
        e << (lead.nil? ? ' ' : lead.squeeze)
      end
      if label[0..2] == 'img' && label.length > 3
        e << "[#{label[3..-1]}]"
      elsif label[0..2] == 'img'
        e << "[image]"
      else
        e << "[#{label}]"
      end
      e << (trail.nil? ? ' ' : trail)
    end
    r << "\n" unless r =~ /\n\n$/

    # Right-align the indexes on the width of the largest one.
    width = @tags.length.to_s.length
    @tags.each_with_index do |tag, i|
      # The URLs come from HTML attributes: undo the entity escaping of
      # '&' and "'" (the former substitutions replaced each character by
      # itself and therefore did nothing).
      url = tag.to_s.gsub(/&amp;/, '&').gsub(/&#39;/, '\'')
      r << "[#{(i + 1).to_s.rjust(width)}] : #{url}\n"
    end
    r
  end

  # Returns the value for the From header: the item author's name, else the
  # feed title, else "Unknown".
  def make_from(item, feed)
    author = item.author
    if author.nil? || author.name.nil? || author.name == ''
      feed.title.nil? ? "Unknown" : feed.title
    else
      author.name
    end
  end

  # Returns the categories joined with ',' (label when present, term
  # otherwise), or '' when +c+ is nil.
  def make_keywords(c)
    return '' if c.nil?
    c.collect { |cat| cat.label.nil? ? cat.term : cat.label }.join(',')
  end

  # Returns the Date header value: the item's published date, else its time,
  # else the current time, formatted by Net::NNTP::Article.make_date.
  def make_rss_date(item)
    d = nil
    d = item.published unless item.published.nil?
    d = item.time if d.nil? && !item.time.nil?
    d = Time.now if d.nil?
    Net::NNTP::Article.make_date(d)
  end
end