August 4, 2008
Distinguishing potential spammers from regular humans by using their profile data as a training corpus: a scratch gist.
require 'rubygems'
require 'stemmer'
require 'classifier'
require 'yaml'
# Previously classified samples, used below as training data.
humans = YAML.load_file('humans.yml')
spammers = YAML.load_file('spammers.yml')

# Bayes classifier with one category per label.
classifier = Classifier::Bayes.new('humans', 'spammers')

# Feed every sample to its category: spammers first, then humans.
[['spammers', spammers], ['humans', humans]].each do |category, samples|
  samples.each { |sample| classifier.train(category, sample) }
end
# For every user registered within the last year, count the links they
# submitted that were rejected (rejected = 1) and those that were not
# (rejected != 1), store both counts in the rbr table, and echo
# "<accepts> <rejects> X <uname>" to stdout as progress.
#
# NOTE(review): a link whose `rejected` column is NULL matches neither
# comparison and is counted in neither total -- confirm the column is NOT NULL.
users = dbh.query("select uid, uname from users where str_to_date(user_regdate, '%b %d, %Y') > date_sub(now(), interval 1 year)")

# The statements only differ in their bound parameters from row to row, so
# prepare each one once up front instead of once per user. A failure to prepare
# now raises here rather than being reported once for every row.
accepts = dbh.prepare("select count(*) from links where submitter = ? and rejected !=1")
rejects = dbh.prepare("select count(*) from links where submitter = ? and rejected =1")
q = dbh.prepare("insert into rbr (uid, accepts, rejects, uname) values (?,?,?,?)")

begin
  users.each_hash do |row|
    begin
      a = accepts.execute(row['uname']).fetch[0].to_s
      print "#{a} "
      r = rejects.execute(row['uname']).fetch[0].to_s
      print "#{r} "
      q.execute(row['uid'], a, r, row['uname'])
      puts "X #{row['uname']}"
    rescue Mysql::Error => e
      # Best effort: report the failing row and carry on with the next user.
      puts "Error code: #{e.errno}, Error message: #{e.error}"
    end
  end
ensure
  # Release the statement handles even if the loop is cut short.
  [accepts, rejects, q].each { |stmt| stmt.close }
end
# Build @spammers from the 100 rejected links with the highest lid: the
# submitter's space-joined profile fields map to the rejection reason. Rows
# that produce the same profile string overwrite one another.
@spammers = {}
spammers = dbh.query("select concat(submitter, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data, rejected_reason from links l, users u where rejected = 1 and l.submitter = u.uname order by lid desc limit 100")
spammers.each_hash { |link| @spammers[link['data'].to_s] = link['rejected_reason'].to_s }
# Build @known_goods from the 100 forum members with the highest postnum:
# each key is the member's space-joined profile fields, each value an empty
# string.
@known_goods = {}
known_goods = dbh.query("select concat(u.uname, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data from users u, XForum_members f where f.username = u.uname order by postnum desc limit 100")
known_goods.each_hash { |member| @known_goods[member['data'].to_s] = "" }
# Pick a random window of user ids -- strictly between @rand and @rand + 100 --
# and build @questionables from those users: each key is the user's
# space-joined profile fields, each value an empty string.
@rand = rand(14000)
@rand2 = @rand + 100
@questionables = {}
questionables = dbh.query(%Q{select concat(u.uname, ' ', u.email, ' ', u.user_avatar, ' ', u.user_aim, ' ', u.user_icq, ' ', u.user_from, ' ', u.user_interest, ' ', u.url) as data from users u where uid > "#{@rand}" and uid < "#{@rand2}"})
questionables.each_hash { |user| @questionables[user['data'].to_s] = "" }