#! /usr/local/bin/ruby

# Copyright (c) 2007 Frédéric Senault. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of Frédéric Senault or any contributors may be
# used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

# Producing the input file: grep -rhm1 '^Path:' . | sort | uniq -c
# or a raw list of paths, but that uses much more memory.

#require "pp"
require "optparse"

# Directory containing this script.
$cdir = File.dirname(__FILE__)

# Run-time parameters, overridable from the command line (see the option
# table below).
#   :me            set by -m.  NOTE(review): nothing visible in this script
#                  reads it; the default is a Regexp while -m stores a String.
#   :fin           input file (or STDIN with -I)
#   :fcache        cache of already canonicalized paths
#   :fout          PNG produced by dot
#   :dout          generated dot source file
#   :dot           path of the Graphviz `dot` binary, nil when none was found
#   :nbr_nodes     maximum number of nodes kept on the graph
#   :cutoff_edges  scaled article count under which an edge is dropped
#   :nbr_prop      base to which article counts are scaled
$par = {
  :me           => /lacave\.net/,
  :fin          => "./fpaths",
  :fcache       => "./cpaths",
  :fout         => "./feed.png",
  :dout         => "./feedmap.dot",
  :dot          => "/usr/local/bin/dot",
  :nbr_nodes    => 50,
  :cutoff_edges => 400,
  :nbr_prop     => 10_000.0
}

# Prints the option summary +res+ followed by the usage notes, then
# terminates the process.
def dohelp(res)
  puts <<-_EOH
#{res}
(FeedMap version 0.2)

INFILE peut être généré par la commande :
  grep -rhm1 '^Path:' . | sort | uniq -c
Dans un spool traditionel.

NODES représente le nombre de nodes maximum à afficher sur le graphe (avant
la détection des nodes orphelins).
EDGES est le nombre d'articles (cfr PROPORTION) à partir duquel il faut
supprimer les flèches.
PROPORTION est le nombre d'articles sur lequel est ramene le compte d'articles.
  _EOH
  exit
end

# Locate `dot` when the default location does not exist.  File.exist? is used
# because File.exists? was removed in Ruby 3.2.  The PATH is searched directly
# instead of shelling out to `whereis -bq`, whose flags are not portable; the
# result is nil (not "") when nothing is found, so a truthiness test on
# $par[:dot] is meaningful.
unless File.exist?($par[:dot])
  candidates = ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).map do |dir|
    File.join(dir, "dot")
  end
  $par[:dot] = candidates.find { |c| File.file?(c) && File.executable?(c) }
end

# Rejects non-positive numeric option values *before* they are stored.
positive = lambda do |flag, value|
  raise "#{flag} doit être > 0" if value <= 0
  value
end

help = false
opts = OptionParser.new do |opt|
  opt.on("-m ME", String)          { |v| $par[:me] = v }
  opt.on("-i INFILE", String)      { |v| $par[:fin] = v }
  opt.on("-I")                     { $par[:fin] = STDIN }
  opt.on("-o OUTFILE", String)     { |v| $par[:fout] = v }
  opt.on("-c CACHEFILE", String)   { |v| $par[:fcache] = v }
  opt.on("-d DOTFILE", String)     { |v| $par[:dout] = v }
  opt.on("-D DOT", String)         { |v| $par[:dot] = v }
  opt.on("-n NODES", Integer)      { |v| $par[:nbr_nodes] = positive.call("-n", v) }
  opt.on("-e EDGES", Integer)      { |v| $par[:cutoff_edges] = positive.call("-e", v) }
  opt.on("-p PROPORTION", Float)   { |v| $par[:nbr_prop] = positive.call("-p", v) }
  opt.on("-h")                     { help = true }
end
opts.parse!
dohelp(opts.to_s) if help

# Labels treated as "top-level-like" when two host names are compared in
# canonicalize: any two-letter label or one of the listed generic TLDs.
TLD = /^(?:[a-z]{2}|net|org|com|info|biz)$/

# Rewrite rules applied to every hop of a path: [pattern, replacement].
TRN = [
  [ /^.*club-internet\.*$/, 'club-internet.fr' ],
  [ /^.*google\.com$/, 'googlegroups.com' ],
  [ /^(?:ip|newsfeeds?|news|feed(?:er)?s?|newspeers?|peers?|peering|nntp)\d*\./ , '' ],
  [ /telenet-ops/, 'telenet' ],
  [ /.*\.blueyonder\.co\.uk/, 'blueyonder.co.uk' ],
  [ /.*\.isp\.belgacom\.be/, 'belgacom.be' ],
  [ /.*\.videotron\.net/, 'videotron.net' ],
  [ /^.*\.uu\.net/, 'uu.net' ],
]

# Hops matching any of these patterns are removed from the path.
FLT = [
  /^swipnet$/,
  /^(?:\d+\.){4}(?i-:MISMATCH)$/,
  /^.*\.(?i-:POSTED)$/,
  /^[a-z]+-berlin\.de$/,
  /^.*\.free\.fr$/,
  /^grolier$/,
  /^.*clubint.net$/,
  /^not-for-mail$/,
  /^sn-[a-z]+-[a-z]+-\d+$/,
  /^(?:\d+\.){3}\d+$/,
  /^(cybercancel|bincancel|udp|robomod|cyberspam|usenet|udp-bot|hip-?crime-.*|pfilter-v.*)$/,
  /^.*\.(ams|phx)$/,
  /^[a-f0-9]{8}$/,
  /^$/,
]

# Computes the common domain of two dotted host names, comparing labels from
# the right.  TLD-like labels are copied from +cur+ without being compared;
# the first pair of differing non-TLD labels stops the scan.
# Returns the merged name, or nil when no non-TLD label was shared.
def merge_hosts(cur, prev)
  cur_labels  = cur.split(/\./).reverse
  prev_labels = prev.split(/\./).reverse
  shared  = []
  matched = false
  [cur_labels.length, prev_labels.length].min.times do |j|
    if cur_labels[j] !~ TLD && prev_labels[j] !~ TLD
      break if cur_labels[j] != prev_labels[j]
      matched = true
    end
    shared << cur_labels[j]
  end
  matched ? shared.reverse.join('.') : nil
end

# Turns a raw "a!b!c" path string into a list of canonical host names:
# lower-cases it, applies the TRN rewrites, drops hops matching FLT, then
# folds neighbouring hops that designate the same site (a bare name next to a
# dotted name containing it, or two dotted names sharing a domain).
# Returns an Array of Strings without duplicates.
def canonicalize(path)
  hops = path.downcase.split(/!/)
  hops.each { |hop| TRN.each { |pattern, replacement| hop.gsub!(pattern, replacement) } }
  hops.reject! { |hop| FLT.any? { |pattern| hop =~ pattern } }

  # An explicit index is used because a merge shrinks the array and the merged
  # hop must be compared again with its new neighbour.
  i = 1
  while i < hops.length
    prev, cur = hops[i - 1], hops[i]
    if cur !~ /\./
      # Regexp.escape: hop names come from the input and may hold metacharacters.
      hops[i] = prev if prev =~ /(\.|^)#{Regexp.escape(cur)}\./
    elsif prev !~ /\./
      hops[i - 1] = cur if cur =~ /(\.|^)#{Regexp.escape(prev)}\./
    else
      merged = merge_hosts(cur, prev)
      if merged
        hops[i] = merged
        hops.delete_at(i - 1)
        next # same index again: it now designates the following hop
      end
    end
    i += 1
  end
  hops.uniq
end

all = []             # [weight, [hop symbols]] for every input line
seen = Hash.new(0)   # hop symbol => total weight of the lines mentioning it
totart = 0           # total weight (article count) over all lines
c = 0                # number of lines read, drives the progress dots

puts "Canonicalisation."
# When the cache file exists it is read instead of the input, and its paths
# are taken as already canonical.
if File.exist?($par[:fcache])
  print "Lecture du cache : "
  cmode = true
  f = $par[:fcache]
else
  print "Progression : "
  cmode = false
  f = $par[:fin]
end

# $par[:fin] is an IO (STDIN) when -I was given; only open, and later close,
# what is a file name.
input = f.respond_to?(:each_line) ? f : File.open(f)
begin
  input.each_line do |l|
    if c % 100 == 0
      print "."
      STDOUT.flush
    end
    l = l.chomp
    # Leading count as written by `uniq -c`; a line without one weighs 1.
    w = l.to_i
    w = 1 if w == 0
    joined = l.gsub(/^\s*(\d+)\s*.*Path: /, '')
    hops = cmode ? joined.split(/!/) : canonicalize(joined)
    hops = hops.collect { |e| e.to_sym }
    hops.each { |e| seen[e] += w }
    totart += w
    all << [w, hops]
    c += 1
  end
ensure
  input.close unless input.equal?(f)
end
puts

unless cmode
  puts "Sauvegarde des paths canonicalisés."
  # Identical canonical paths are merged by summing their weights; a hash is
  # used so the merge does not depend on the order of the input.
  totals = Hash.new(0)
  all.each { |weight, hops| totals[hops.join('!')] += weight }
  File.open($par[:fcache], "w") do |cache|
    totals.keys.sort.each do |joined|
      cache.puts "%10d Path: %s" % [ totals[joined], joined ]
    end
  end
end

$nodes = {} # Hash.new { |k, v| k[v] = Node.new(v, -1) }
$edges = nil

# A site on the graph, with its article counters and the edges touching it.
class Node
  attr_accessor :name, :seen, :rank, :edges
  attr_writer :feed, :post

  def initialize(name, seen, rank)
    @name = name
    @seen = seen
    @rank = rank
    @feed = 0
    @post = 0
    @edges = []
  end

  # Adds +w+ to the posted-article counter and returns the new total.
  # Calling it without argument still adds 1: use post_count to read.
  def post(w = 1)
    @post += w
  end

  # Adds +w+ to the fed-article counter and returns the new total.
  # Calling it without argument still adds 1: use feed_count to read.
  def feed(w = 1)
    @feed += w
  end

  # Posted-article counter, read without modifying it.
  def post_count
    @post
  end

  # Fed-article counter, read without modifying it.
  def feed_count
    @feed
  end

  # Maps the rank onto 0..max proportionally to $par[:nbr_nodes].
  def rank_slice(max)
    ((@rank * max) / $par[:nbr_nodes]).to_i
  end

  def attach(e)
    @edges << e
  end

  # Forgets edge +e+; a node left without any edge removes itself from $nodes.
  def detach(e)
    @edges.delete(e)
    $nodes.delete(@name) if @edges.length == 0
  end
end

# A directed link between two nodes of $nodes, weighted by article count.
class Edge
  attr_accessor :from, :to, :cpt

  # +v+ is the [from_name, to_name] pair; both nodes must already be in $nodes.
  def initialize(v)
    @from = $nodes[v[0]]
    @to = $nodes[v[1]]
    @cpt = 0
    @from.attach(self)
    @to.attach(self)
  end

  def count(w)
    @cpt += w
  end
end

puts "Extraction des nodes."
# Edges are created on first access to their [from, to] key.
$edges = Hash.new { |k, v| k[v] = Edge.new(v) }

# Keep the $par[:nbr_nodes] most frequently seen hops, ranked from 1.
ranked = seen.sort_by { |_name, weight| -weight }[0, $par[:nbr_nodes]]
ranked.each_with_index do |(name, weight), idx|
  $nodes[name] = Node.new(name, weight, idx + 1)
end

puts "Extraction des edges."
all.each do |w, hops|
  origin = hops[-1]
  hops.reject! { |e| !$nodes.key?(e) }
  hops.uniq!
  # A path without any retained hop has nothing to contribute.
  next if hops.empty?

  # Each hop is fed by the hop written to its right in the path.
  (1..hops.length - 1).each do |i|
    edge = $edges[[hops[i], hops[i - 1]]]
    edge.count(w)
    edge.to.feed(w)
  end

  # The last retained hop posted the articles itself when it is the original
  # end of the path; otherwise it received them from a hop that was dropped.
  if hops[-1] == origin
    $nodes[hops[-1]].post(w)
  else
    $nodes[hops[-1]].feed(w)
  end
end

puts "Filtrage des edges."
$edges.reject! do |_key, edge|
  below = (edge.cpt * $par[:nbr_prop] / totart) < $par[:cutoff_edges]
  if below
    edge.from.detach(edge)
    edge.to.detach(edge)
  end
  below
end

puts "Génération des nodes et edges du bord."
cv = 0
# Iterate over a snapshot of the keys: virtual nodes are added on the way.
$nodes.keys.each do |k|
  n = $nodes[k]
  # Articles fed to this node that no remaining incoming edge accounts for.
  rest = n.feed_count
  n.edges.each { |e| rest -= e.cpt if n == e.to }
  if (rest * $par[:nbr_prop] / totart) >= $par[:cutoff_edges]
    nk = "virt_#{cv}"
    $nodes[nk] = Node.new(nk, rest, $par[:nbr_nodes] + 1)
    $edges[[nk, k]].count(rest)
    cv += 1
  end
end

shape = 'box'
style = 'filled,rounded'

nattr = {
  :shape => shape,
  :style => style,
  :color => '#000033',
  :fillcolor => '#ccccff',
  :fontname => 'FreeMono',
  :fontsize => '12',
}

eattr = {
  :color => '#000033',
  :fontname => 'FreeMono',
  :fontsize => '10',
}

puts "Création du fichier dot."

node_defaults = nattr.map { |key, value| %<#{key.to_s}="#{value}"> }.join(',')
edge_defaults = eattr.map { |key, value| %<#{key.to_s}="#{value}"> }.join(',')

File.open($par[:dout], 'w') do |d|
  d.puts %|digraph G {|
  d.puts %| graph [overlap=scalexy];|
  d.puts %| node [#{node_defaults}];|
  $nodes.each do |_key, n|
    if n.name.to_s =~ /^virt_/
      d.puts %| "#{n.name}" [style="invis" ];| #label="" fillcolor="#ffffff" color="#ffffff" ];|
    else
      # One hex digit, repeated, so the fill colour varies with the rank.
      cl = (6 + n.rank_slice(8)).to_s(16)
      posted = (n.post_count * $par[:nbr_prop] / totart).round
      fed = (n.feed_count * $par[:nbr_prop] / totart).round
      label = if posted > 0
                "#{n.name}\\n#{posted} (#{fed})"
              else
                "#{n.name}\\n(#{fed})"
              end
      d.puts %| "#{n.name}" [label="#{label}" fillcolor="##{cl*4}ff" ];|
    end
  end
  d.puts %| edge [#{edge_defaults}];|
  $edges.each do |_key, e|
    # Same scaling base as the node labels and the cutoff test.
    nc = (e.cpt * $par[:nbr_prop] / totart).round
    if e.from.name.to_s =~ /^virt_/
      d.puts %| "#{e.from.name}" -> "#{e.to.name}" [label="#{nc}" style="dashed"];|
    else
      d.puts %| "#{e.from.name}" -> "#{e.to.name}" [label="#{nc}"];|
    end
  end
  d.puts %|}|
end

# Render with the configured binary (-D or the detected one).  Arguments are
# passed as a list so file names never go through a shell.
dot = $par[:dot]
if dot && !dot.to_s.empty?
  system(dot.to_s, "-Tpng", "-o", $par[:fout].to_s, $par[:dout].to_s)
end

puts "Fin."