Daemian Mack home

Bayesian spam classifier: spammers vs humans

August 4, 2008

Distinguishing potential spammers from regular humans by using their profile data as a training corpus: a scratch gist.

require 'rubygems'
require 'stemmer'
require 'classifier'
require 'yaml'

# Previously labelled examples, one YAML file per category.
humans   = YAML.load_file('humans.yml')
spammers = YAML.load_file('spammers.yml')

# Naive Bayes classifier with two categories: humans and spammers.
classifier = Classifier::Bayes.new('humans', 'spammers')

# Feed every labelled example to its category (spammers first, then humans).
spammers.each do |sample|
  classifier.train_spammers(sample)
end
humans.each do |sample|
  classifier.train_humans(sample)
end

# For every user registered within the last year, count their non-rejected and
# rejected link submissions and record both counts in the `rbr` table.
#
# Progress is printed as "<accepts> <rejects> X <uname>", one line per user.
# A Mysql::Error on one user is reported and the loop moves on to the next.
users = dbh.query("select uid, uname from users where str_to_date(user_regdate, '%b %d, %Y') > date_sub(now(), interval 1 year)")

# The statements are prepared once and re-executed per row; preparing them
# inside the loop would leave three unclosed server-side statements per user.
accepts = rejects = insert = nil
begin
  accepts = dbh.prepare("select count(*) from links where submitter = ? and rejected !=1")
  rejects = dbh.prepare("select count(*) from links where submitter = ? and rejected =1")
  insert  = dbh.prepare("insert into rbr (uid, accepts, rejects, uname) values (?,?,?,?)")

  users.each_hash do |row|
    begin
      a = accepts.execute(row['uname']).fetch[0].to_s
      print "#{a} "
      r = rejects.execute(row['uname']).fetch[0].to_s
      print "#{r} "
      insert.execute(row['uid'], a, r, row['uname'])
      puts "X #{row['uname']}"
    rescue Mysql::Error => e
      # Best effort: report the failure for this user and keep going.
      puts "Error code: #{e.errno}, Error message: #{e.error}"
    end
  end
rescue Mysql::Error => e
  # A failure while preparing the statements is reported the same way.
  puts "Error code: #{e.errno}, Error message: #{e.error}"
ensure
  # Release whichever statements were successfully prepared.
  [accepts, rejects, insert].compact.each { |stmt| stmt.close }
end

# Build @spammers: concatenated profile fields => rejection reason, taken from
# the 100 rejected links with the highest lid, joined to the submitting user.
#
# NOTE(review): MySQL CONCAT yields NULL when any argument is NULL; such rows
# would all be stored under the "" key. Confirm the profile columns are NOT NULL.
@spammers = {}
spammers = dbh.query("select concat(submitter, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data, rejected_reason from links l, users u where rejected = 1 and l.submitter = u.uname order by lid desc limit 100")

# nil columns become "" via to_s, exactly as string interpolation would.
spammers.each_hash { |rec| @spammers[rec['data'].to_s] = rec['rejected_reason'].to_s }

# Build @known_goods: concatenated profile fields => "" for the first 100
# users that also appear in XForum_members, ordered by postnum descending.
@known_goods = {}
known_goods = dbh.query("select concat(u.uname, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data from users u, XForum_members f where f.username = u.uname order by postnum desc limit 100")

# Only the keys matter here; every value is an empty string.
known_goods.each_hash { |rec| @known_goods[rec['data'].to_s] = "" }

# Build @questionables: concatenated profile fields => "" for users whose uid
# falls strictly between a random lower bound and that bound plus 100.
#
# NOTE(review): 14000 is presumably close to the highest uid in `users`;
# confirm against the table before relying on the sample being non-empty.
@questionables = {}
@rand = rand(14000)
@rand2 = @rand + 100

# The bounds are Integers, so they are emitted as bare numeric literals.
# Wrapping them in double quotes made MySQL treat them as strings (or as
# identifiers when the ANSI_QUOTES SQL mode is enabled).
questionables = dbh.query(%Q{select concat(u.uname, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data from users u where uid > #{@rand} and uid < #{@rand2}})

questionables.each_hash do |row|
  @questionables["#{row['data']}"] = ""
end