Hduoj用户AC题数统计爬虫
趁着前几天出去比赛的空余时间瞎写完了
一向写python都是现写现查库怎么用
写这个主要是为了班里统计方便 ,但现在应该用不上了2333
写的挺垃圾,也是第一次搞这个,因为库太好用了感觉自己写的也没啥水平哈哈
其中主要点是判断指定日期的做题AC数
用了下用户实时的提交页面
http://acm.hdu.edu.cn/status.php?first=&pid=&user=aaa&lang=0&status=0
1. 逐页爬取该用户的提交记录,直到遇到早于指定起始日期的记录为止
2.判断时间是否符合,顺便学习了下datetime
- 因为提交记录是按时间从新到旧排列的,所以晚于终止日期的记录直接跳过,直到遇到早于起始日期的记录时跳出循环
- 需要注意的是:如果这个用户在指定日期及之前都没有答过题,就会造成死循环,所以要判断当前是否已经是最后一页;因为 GET 参数里没有页数,所以判断 first 是否重复就 ok
3.判断是否ac
4.丢到dict
分享一下渣渣代码
# encoding=utf-8
"""HDU OJ crawler: count the problems each user got Accepted in a date range.

For every user ID in the first column of ``./acmid.xlsx`` the script reads the
total solved count from the user's profile page, walks the user's submission
status pages, and writes the results to ``Acm<start>to<end>.xls``.

The date range lives in module-level names assigned in the ``__main__`` block:
``start``/``end`` (the raw text typed by the operator) and ``d1``/``d2`` (the
parsed datetimes).  ``save_excel`` reads ``start``/``end`` and the crawl reads
``d1``/``d2``, so they must be set before ``main`` or ``run`` is called.
"""
import datetime
import re

import requests
import xlrd
import xlwt
from bs4 import BeautifulSoup

USER_STATUS_URL = "http://acm.hdu.edu.cn/userstatus.php"
SUBMIT_STATUS_URL = "http://acm.hdu.edu.cn/status.php"

# Compiled once at import time instead of once per user.
NAME_PATTERN = re.compile(r'>(.*)</h1>')
SOLVED_PATTERN = re.compile(r'Solved</td><td align=center>([0-9]+)<')

# Value of the ``first`` query parameter used for a user's first status request.
NEWEST_RUN_ID = '99999999'

# Meaning of the first four cells of a status-table row, in page order.
ROW_KEYS = ('Runid', 'Subtime', 'Status', 'Id')


def save_mysql():
    """Placeholder for a database export; intentionally does nothing."""
    pass


def save_excel(users, Allsolved, pid, sum):
    """Write the collected statistics to ``Acm<start>to<end>.xls``.

    The four sequences are parallel: entry ``i`` of each one is written to
    sheet row ``i + 1`` (row 0 holds the headers).

    Args:
        users: user IDs (column 0).
        Allsolved: total solved count per user (column 2).
        pid: problem IDs accepted inside the date range, per user (column 3).
        sum: number of problems accepted inside the date range, per user
            (column 1).  The name shadows the builtin but is kept because it
            is part of the existing signature.

    Reads the module-level ``start`` and ``end`` strings for the header text
    and the output file name.
    """
    id_col = 0
    range_solved_col = 1
    total_solved_col = 2
    range_pid_col = 3

    book = xlwt.Workbook(encoding='utf-8')
    sheet1 = book.add_sheet('sheet1', cell_overwrite_ok=True)
    heads = ['ID', u'指定日期累计AC题数' + '(' + start + 'to' + end + ')', u'总共已AC题数', u'指定日期AC题号']
    print(u'\n准备将数据存入表格...')
    for col, head in enumerate(heads):
        sheet1.write(0, col, head)
    sheet1.col(range_solved_col).width = 256 * 20

    columns = (
        (id_col, users),
        (total_solved_col, Allsolved),
        (range_pid_col, pid),
        (range_solved_col, sum),
    )
    for col, values in columns:
        # Data rows start at 1; row 0 is the header row.
        for row, value in enumerate(values, 1):
            sheet1.write(row, col, value)

    book.save('Acm' + start + 'to' + end + '.xls')
    print(u'\n录入成功!')


def _normalize_user_ids(raw_values):
    """Turn raw spreadsheet cell values into a clean list of user-ID strings.

    Numeric cells (xlrd returns them as floats) are rendered as integer
    text, surrounding whitespace is stripped, and blank cells are dropped so
    that they are never sent to the server as an empty user name.
    """
    users = []
    for value in raw_values:
        if isinstance(value, (int, float)):
            value = '%d' % value
        value = value.strip()
        if value:
            users.append(value)
    return users


def _fetch_profile(user):
    """Return ``(display_name, solved_count_text)`` from the profile page.

    Returns ``None`` when either piece cannot be found in the page, for
    example when the user ID does not exist.
    """
    page = requests.get(USER_STATUS_URL, params={'user': user})
    name_match = NAME_PATTERN.search(page.text)
    solved_match = SOLVED_PATTERN.search(page.text)
    if name_match is None or solved_match is None:
        return None
    return name_match.group(1), solved_match.group(1)


def _fetch_status_rows(user, first):
    """Fetch one status page and return its rows as dicts keyed by ROW_KEYS.

    ``first`` is sent as the ``first`` query parameter.  Passing the values
    through ``params`` lets requests URL-encode the user ID.
    """
    response = requests.get(
        SUBMIT_STATUS_URL,
        params=[('first', first), ('pid', ''), ('user', user),
                ('lang', '0'), ('status', '0')])
    soup = BeautifulSoup(response.text, 'lxml')
    # The code relies on the submission list being the fourth <table>.
    table = soup.findAll('table')[3]
    rows = []
    # [1:] skips the first <tr> (presumably the header row).
    for tr in table.findAll('tr')[1:]:
        cells = [td.getText() for td in tr.findAll('td')[:len(ROW_KEYS)]]
        if len(cells) < len(ROW_KEYS):
            # Not a submission row; skip it rather than reuse stale fields.
            continue
        rows.append(dict(zip(ROW_KEYS, cells)))
    return rows


def _collect_accepted(user):
    """Walk a user's status pages and collect problems accepted in [d1, d2].

    Returns ``(problem_ids, count)`` where ``problem_ids`` lists each distinct
    accepted problem ID once, in the order encountered.  Reads the
    module-level ``d1`` and ``d2`` datetimes.
    """
    first = NEWEST_RUN_ID
    seen_ids = set()
    problem_ids = []
    count = 0
    keep_going = True
    while keep_going:
        rows = _fetch_status_rows(user, first)
        if not rows:
            # No more submissions (or none at all): stop instead of looping
            # forever on the same empty page.
            break
        fresh = []
        for row in rows:
            # Only the date part of the submit time is compared.
            day = datetime.datetime.strptime(row['Subtime'].strip()[:10],
                                             '%Y-%m-%d')
            if day < d1:
                # Older than the range: finish this page, then stop paging.
                keep_going = False
            elif (day <= d2 and row['Status'] == 'Accepted'
                    and row['Id'] not in seen_ids):
                seen_ids.add(row['Id'])
                problem_ids.append(row['Id'])
                fresh.append(row)
        for row in fresh:
            print(row)
        count += len(fresh)
        # Continue just below the last run ID seen on this page.
        first = str(int(rows[-1]['Runid']) - 1)
    return problem_ids, count


def run(users):
    """Crawl every user in ``users`` and save the statistics with save_excel.

    For each user this prints the profile name and total solved count, then
    the accepted submissions found inside the date range and their count.
    A user whose profile page cannot be parsed is reported and recorded with
    a blank total and a zero range count, so the spreadsheet rows stay aligned
    with ``users``.
    """
    all_solved = []
    accepted_ids = []
    accepted_counts = []
    for user in users:
        # Total number of problems solved so far.
        profile = _fetch_profile(user)
        if profile is None:
            print(u'无法获取用户信息: ' + user)
            all_solved.append('')
            accepted_ids.append('|')
            accepted_counts.append(0)
            continue
        display_name, solved_text = profile
        print(display_name + " " + solved_text)
        all_solved.append(solved_text)

        # Statistics for the requested date range.
        print(user)
        problem_ids, count = _collect_accepted(user)
        print('------')
        print(count)
        print('------')
        # Cell text is '|' followed by each problem ID and a trailing space.
        accepted_ids.append('|' + ''.join(p + ' ' for p in problem_ids))
        accepted_counts.append(count)
    save_excel(users, all_solved, accepted_ids, accepted_counts)


def main():
    """Read user IDs from column 0 of ``./acmid.xlsx`` and run the crawl."""
    book = xlrd.open_workbook('./acmid.xlsx')
    sheet = book.sheet_by_name('sheet1')
    users = _normalize_user_ids(sheet.col_values(0))
    print(users)
    run(users)


if __name__ == '__main__':
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    print('日期输入格式:年-月-日')
    start = read_line("起始日期:")
    end = read_line("终止日期:")
    d1 = datetime.datetime.strptime(start, '%Y-%m-%d')
    d2 = datetime.datetime.strptime(end, '%Y-%m-%d')
    main()