Hduoj用户AC题数统计爬虫
趁着前几天出去比赛的空余时间瞎写完了
一向写python都是现写现查库怎么用
写这个主要是为了班里统计方便 ,但现在应该用不上了2333
写的挺垃圾,也是第一次搞这个,因为库太好用了感觉自己写的也没啥水平哈哈
其中主要点是判断指定日期的做题AC数
用了下用户实时的提交页面
http://acm.hdu.edu.cn/status.php?first=&pid=&user=aaa&lang=0&status=0
1.从最新的提交开始,一页一页往前爬取,直到早于指定起始日期为止的所有提交记录
2.判断时间是否符合,顺便学习了下datetime
- 因为提交记录是按时间从新到旧排列的,所以晚于终止日期的记录直接跳过,直到遇见早于起始日期的记录才跳出循环
- 需要注意的是:如果这个用户在指定起始日期及之前从没提交过题目,就会造成死循环,所以要判断一下当前是否已经是最后一页;因为 GET 参数里没有页数,所以判断 first 是否重复就 OK
3.判断是否ac
4.丢到dict
分享一下渣渣代码
# encoding=utf-8
import requests
import re
import xlrd
import xlwt
from bs4 import BeautifulSoup
import datetime
def save_mysql():
    """Placeholder for a MySQL export; intentionally does nothing yet."""
def save_excel(users, Allsolved, pid, sum):
    """Write the collected statistics to an ``.xls`` workbook.

    Reads the module-level ``start`` and ``end`` date strings to build both
    the header of the in-range column and the output file name.

    Args:
        users: user ids, written to column 0.
        Allsolved: total solved count per user, written to column 2.
        pid: per-user problem-id data, written to column 3; each entry is
            passed to ``sheet.write`` unchanged.
        sum: in-range accepted count per user, written to column 1.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)

    date_range = start + 'to' + end
    titles = ['ID', u'指定日期累计AC题数' + '(' + date_range + ')', u'总共已AC题数', u'指定日期AC题号']
    print(u'\n准备将数据存入表格...')

    # Header row.
    for column, title in enumerate(titles):
        sheet.write(0, column, title)

    # Widen the in-range count column so its long header stays readable.
    sheet.col(1).width = 256 * 20

    # Data starts on row 1; each sequence fills its own column top to bottom.
    column_data = ((0, users), (2, Allsolved), (3, pid), (1, sum))
    for column, values in column_data:
        for row, value in enumerate(values, 1):
            sheet.write(row, column, value)

    workbook.save('Acm' + date_range + '.xls')
    print(u'\n录入成功!')
def _parse_status_rows(html):
    """Extract submission records from one status-page HTML document.

    The fourth ``<table>`` of the page is taken as the submission list and its
    first row is skipped as the header. The first four cells of every other
    row become the ``Runid``, ``Subtime``, ``Status`` and ``Id`` fields.
    Rows with fewer than four cells are ignored rather than being padded with
    values left over from an earlier row.

    Raises:
        IndexError: if the page contains fewer than four tables.
    """
    soup = BeautifulSoup(html, 'lxml')
    status_table = soup.findAll('table')[3]
    records = []
    for tr in status_table.findAll('tr')[1:]:
        cells = [td.getText() for td in tr.findAll('td')[:4]]
        if len(cells) < 4:
            continue
        records.append({'Runid': cells[0], 'Subtime': cells[1],
                        'Status': cells[2], 'Id': cells[3]})
    return records


def _collect_accepted(user):
    """Page through a user's submissions and collect in-range accepted ones.

    Uses the module-level ``d1`` (start) and ``d2`` (end) datetimes as an
    inclusive date window. Pages are requested from the highest run id
    downwards (the listing is assumed to be newest-first), and paging stops
    once a page is empty or contains a submission dated before ``d1``.

    Returns:
        A ``(repid, count)`` tuple. ``repid`` is a list starting with ``'|'``
        followed by each distinct accepted problem id and a ``' '``
        separator; ``count`` is the number of distinct accepted problems.
    """
    first = '99999999'
    repid = ['|']  # doubles as the duplicate filter and the id list to export
    count = 0
    keep_paging = True
    while keep_paging:
        page = requests.get(
            "http://acm.hdu.edu.cn/status.php?first=" + first
            + "&pid=&user=" + user + "&lang=0&status=0")
        records = _parse_status_rows(page.text)
        if not records:
            # No rows left (or the user never submitted): stop here instead
            # of relying on a run id remembered from an earlier page or user.
            break

        fresh = []
        for record in records:
            # Drop the trailing " HH:MM:SS" so only the date is compared.
            day = datetime.datetime.strptime(record['Subtime'][0:-9], '%Y-%m-%d')
            if day < d1:
                # Older than the window: finish this page, then stop paging.
                keep_paging = False
            elif (day <= d2 and record['Status'] == 'Accepted'
                    and record['Id'] not in repid):
                repid.append(record['Id'])
                repid.append(' ')
                fresh.append(record)

        for record in fresh:
            print(record)
        count += len(fresh)
        # The next page starts just below the last run id seen on this one.
        first = str(int(records[-1]['Runid']) - 1)
    return repid, count


def run(users):
    """Gather statistics for every user and export them with ``save_excel``.

    For each user this fetches the profile page to read the total solved
    count, then walks the submission status pages to count distinct problems
    accepted between the module-level ``d1`` and ``d2`` dates.

    Args:
        users: sequence of user id strings; each one is concatenated into the
            request URLs.

    Raises:
        AttributeError: if a profile page does not contain the expected name
            heading or solved counter (``re.search`` returns ``None``).
    """
    name_re = re.compile(r'>(.*)</h1>')
    solved_re = re.compile(r'Solved</td><td align=center>([0-9]+)<')

    all_solved = []
    problem_ids = []
    range_counts = []
    for user in users:
        # Total number of problems solved so far.
        profile = requests.get("http://acm.hdu.edu.cn/userstatus.php?user=" + user)
        name = name_re.search(profile.text)
        solved = solved_re.search(profile.text)
        print(name.group(1) + " " + solved.group(1))
        all_solved.append(solved.group(1))

        # Accepted problems inside the requested date window.
        print(user)
        repid, accepted = _collect_accepted(user)
        print('------')
        print(accepted)
        print('------')
        problem_ids.append(repid)
        range_counts.append(accepted)
    save_excel(users, all_solved, problem_ids, range_counts)
def main():
    """Load the user ids from ``./acmid.xlsx`` and run the statistics.

    The ids are taken from the first column of the sheet named ``sheet1``.
    """
    id_sheet = xlrd.open_workbook('./acmid.xlsx').sheet_by_name('sheet1')
    users = id_sheet.col_values(0)
    print(users)
    run(users)
if __name__ == '__main__':
    # Ask for the inclusive date window; the raw strings are kept as globals
    # because they are reused for the spreadsheet header and file name.
    print('日期输入格式:年-月-日')
    start = raw_input("起始日期:")
    end = raw_input("终止日期:")
    # Parse both bounds (start first, then end) into datetimes at midnight.
    d1, d2 = (datetime.datetime.strptime(text, '%Y-%m-%d') for text in (start, end))
    main()




