github网址
学习笔记
复习到：Python 爬虫写法；比对两笔资料（difflib）写法；pickle 储存 session；try/except 写报错方法；路径、读写文件、生成 Excel（openpyxl -> Workbook）写法。
待更新：用法、补 README.md。
main.py
"""Export an Instagram account's following/followers list to txt and Excel.

Credentials, endpoints, request headers and the output directory come from
``config.py``; the optional comparison against an older export is delegated
to ``compare.compare_file``.
"""
from bs4 import BeautifulSoup
import requests
import pickle
import re
from config import username, password, headers, url, ajax_url, p_url, path
from datetime import datetime
from openpyxl import Workbook
import compare
import os
import sys


def create_folder(path):
    """Create directory *path* (including parents) if it does not exist."""
    if not os.path.isdir(path):
        os.makedirs(path)


def ask_excel(ask_option):
    """Ask a yes/no question on stdin until a valid answer is given.

    Args:
        ask_option: Text inserted into the prompt ``Do u want ...?``.

    Returns:
        1 for ``y``/``Y``/``yes``; 0 for ``n``/``N``/``no`` or an empty
        answer (the default).
    """
    yes_answers = ('y', 'Y', 'yes')
    no_answers = ('n', 'N', 'no', '')
    while True:
        answer = input(f'Do u want {ask_option}? y/n [n]: ')
        if answer in yes_answers:
            return 1
        if answer in no_answers:
            return 0
        print('plz enter y or n or ENTER!!!')


def do_excel(uid, date, opt_title, option, root_json, path):
    """Write the user list in *root_json* to an ``.xlsx`` workbook.

    The file is saved as ``{path}{uid}{date}{opt_title[option]}.xlsx`` with
    one header row followed by one row per entry of ``root_json['users']``.

    Args:
        uid: Target account name, used in the file name.
        date: Timestamp string, used in the file name.
        opt_title: Mapping from *option* to its file-name suffix.
        option: Key into *opt_title*.
        root_json: Decoded API response; must contain a ``'users'`` list
            whose items have ``username``, ``full_name`` and
            ``profile_pic_url`` keys.
        path: Output directory prefix (concatenated as-is).
    """
    wb = Workbook()
    ws = wb.active
    ws.append(['username', 'full_name', 'profile_pic'])
    for user in root_json['users']:
        ws.append([
            '@' + user['username'],
            user['full_name'],
            user['profile_pic_url'] + '.jpg',
        ])
    wb.save(path + f'{uid}{date}{opt_title[option]}.xlsx')


def do_txt(uid, date, opt_title, option, root_json, path):
    """Write the user list in *root_json* to a UTF-8 text file.

    The file is saved as ``{path}{uid}{date}{opt_title[option]}.txt``. Each
    user becomes one line holding the ``repr`` of ``('@username',
    full_name)``; the last line is ``Total: N records!``.

    Args:
        uid: Target account name, used in the file name.
        date: Timestamp string, used in the file name.
        opt_title: Mapping from *option* to its file-name suffix.
        option: Key into *opt_title*.
        root_json: Decoded API response; must contain a ``'users'`` list
            whose items have ``username`` and ``full_name`` keys.
        path: Output directory prefix (concatenated as-is).
    """
    count = 0
    file_name = path + f'{uid}{date}{opt_title[option]}.txt'
    with open(file_name, 'w', encoding='utf-8') as f:
        for user in root_json['users']:
            record = ('@' + user['username'], user['full_name'])
            f.write(str(record) + '\n')
            count += 1
        f.write(f'Total: {count} records!')
    print(f'Got {count} records!!!')


def main():
    """Prompt for a target, log in (or reuse a saved session) and export.

    Exits the process via ``sys.exit()`` when the login page has no csrf
    token, the target's user id cannot be determined, the list response is
    not JSON, or the comparison step hits an ``IOError``.
    """
    now = datetime.now()
    date = now.strftime("%Y%m%d-%H%M")
    timestamp = int(now.timestamp())
    payload = {
        'username': f'{username}',
        'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{timestamp}:{password}',
        'queryParams': {},
        'optIntoOneTap': 'false',
    }
    print('If target is private account, you have to follow it first!!!')

    while True:
        uid = str(input('Enter id: '))
        if uid != '':
            break
        print('Do not leave blank!!!')

    # An empty answer selects the default, 'following'.
    opt_list = {
        '': 'following',
        '1': 'following',
        '2': 'followers',
        'following': 'following',
        'followers': 'followers',
    }
    while True:
        option = str(input('following[1]/followers[2] [1]: '))
        if option in opt_list:
            option = opt_list[option]
            break
        print('enter 1 or 2 or following or followers!!!')

    try:
        fcount = int(input('Enter max num of following/followers [2000]: '))
    except ValueError:
        fcount = 2000

    opt_title = {'following': 'fwi', 'followers': 'fwr'}
    ask = ask_excel('Excel file(will have profile pic)')
    session_file = f'{path}{username}session.pkl'

    with requests.session() as session:
        if not os.path.exists(session_file):
            print('Getting sessions')
            # The first GET yields the initial cookies and the csrf token.
            res = session.get(url)
            tokens = re.findall(r"csrf_token\":\"(.*?)\"", res.text)
            if not tokens:
                print('error: csrf_token not found in login page, '
                      'plz try again later!!')
                sys.exit()
            csrf = tokens[0]
            cookies = res.cookies
            cookies['csrf'] = csrf
            headers['x-csrftoken'] = csrf
            # Log in with those cookies + csrf token, then persist the
            # logged-in session cookies for later runs.
            session.post(ajax_url, data=payload, headers=headers,
                         cookies=cookies)
            with open(session_file, 'wb') as f:
                pickle.dump(session.cookies, f)
        else:
            print('Reloading sessions and updating cookies')
            # NOTE(security): pickle.load executes arbitrary code from the
            # file; only load session files this script wrote itself.
            with open(session_file, 'rb') as f:
                session.cookies.update(pickle.load(f))
            # CookieJar.update() returns None, so there are no per-request
            # cookies here; the session's own jar is sent automatically.
            cookies = None
            headers['x-csrftoken'] = session.cookies['csrftoken']
        headers['Referer'] = f'https://www.instagram.com/{uid}/following/'

        fsi = session.get(p_url + uid, cookies=cookies, headers=headers)
        ids = re.findall(r"id\":\"(.*?)\"", fsi.text)
        # The second "id" match is used as the target's user id.
        # NOTE(review): '236' is treated as a failed lookup -- confirm what
        # this value means in the profile response.
        friendid = ids[1] if len(ids) > 1 else None
        if friendid is None or friendid == '236' or ids[-1] == '236':
            # Drop the cached session so the next run logs in afresh.
            try:
                os.remove(session_file)
            except FileNotFoundError:
                pass
            print('error while checking userid')
            print('1.plz check the target username(no @)!!!')
            print('2.Make sure u set the right USERNAME and PASSWORD in '
                  '*config.py* file!!!')
            print('3.your account might block by instagram server, '
                  'plz try again later or change your ip!!')
            sys.exit()
        print(f"userid:{friendid}")

        # Query parameters: either page through with max_id or simply ask
        # for a large count in one request.
        params = {
            'count': fcount,
            'max_id': '',
            'search_surface': 'follow_list_page',
        }
        response = session.get(
            f'https://i.instagram.com/api/v1/friendships/{friendid}/{option}/',
            params=params, cookies=cookies, headers=headers)
        try:
            root_json = response.json()
        except requests.exceptions.JSONDecodeError as jsonError:
            print(f'Error when processing json file: {jsonError}')
            print('1.Make sure u set the right USERNAME and PASSWORD in '
                  '*config.py* file!!!')
            print('2.your account might block by instagram server, '
                  'plz try again later or change your ip!!')
            sys.exit()

        if ask == 1:
            try:
                do_excel(uid, date, opt_title, option, root_json, path)
            except IOError as error:
                print(f'Error when generate Excel file:{error}')
        do_txt(uid, date, opt_title, option, root_json, path)

        if ask_excel('compare with old file') == 1:
            try:
                f1 = path + input('Enter first filename(older file): ') + '.txt'
                f2 = path + f'{uid}{date}{opt_title[option]}' + '.txt'
                compare.compare_file(f1, f2)
            except IOError as error:
                print(f'Error when generate compared.txt file:{error}')
                print('Make sure u have the file existed and enter the '
                      'right filename(without .txt)!!!')
                sys.exit()


if __name__ == '__main__':
    create_folder(path)
    # config.py ships with this placeholder; refuse to run until it is set.
    if username == 'USERNAME or EMAIL':
        print('plz go config.py to set your USERNAME and PASSWORD')
        sys.exit(0)
    main()
compare.py
"""Compare two text exports and save the differences as text and HTML."""
import sys
import difflib
import os
from config import path


def create_folder(path):
    """Create directory *path* (including parents) if it does not exist."""
    if not os.path.isdir(path):
        os.makedirs(path)


def read_file(file_name):
    """Return the lines of the UTF-8 text file *file_name*.

    Line endings are stripped. On ``IOError`` the error is printed and the
    process exits via ``sys.exit()``.
    """
    try:
        # ``with`` closes the handle even if read() raises.
        with open(file_name, 'r', encoding="utf-8") as file_handle:
            return file_handle.read().splitlines()
    except IOError as error:
        print('Read file Error: {0}'.format(error))
        sys.exit()


def compare_file(file1_name, file2_name):
    """Diff two text files and write the result under ``config.path``.

    Added/removed lines are printed and written to ``compared.txt`` (only
    when there are any); a side-by-side HTML diff is always written to
    ``result.html``.

    Args:
        file1_name: Path of the older file.
        file2_name: Path of the newer file.

    Exits via ``sys.exit()`` if either path is empty or unreadable.
    """
    if file1_name == "" or file2_name == "":
        print('文件路径不能为空: file1_name的路径为: {0}, file2_name的路径为: {1} .'.format(file1_name, file2_name))
        sys.exit()

    text1_lines = read_file(file1_name)
    text2_lines = read_file(file2_name)

    # Keep only added ('+') and removed ('-') lines, and skip any line
    # ending in '!' -- main.py's do_txt ends each export with a
    # 'Total: N records!' trailer.
    print_list = ['+', '-']
    no_print_list = ['!']
    differ = difflib.Differ()
    diff_print = [
        line for line in differ.compare(text1_lines, text2_lines)
        if line[0] in print_list and line[-1] not in no_print_list
    ]

    if diff_print:
        print('record of compare will be storage in compared.txt and result.html !')
        diff_text = '\n'.join(diff_print)
        try:
            with open(path + 'compared.txt', 'w', encoding="utf-8") as result_file:
                result_file.write(diff_text)
                print(diff_text)
        except IOError as error:
            print('写入compared.txt文件时发生错误:{0}'.format(error))
    else:
        print("It's all same as old file")

    # Build the side-by-side HTML comparison and save it to result.html.
    result = difflib.HtmlDiff().make_file(text1_lines, text2_lines)
    try:
        with open(path + 'result.html', 'w', encoding="utf-8") as result_file:
            result_file.write(result)
    except IOError as error:
        print('写入html文件错误:{0}'.format(error))


if __name__ == '__main__':
    create_folder(path)
    f1 = path + input('Enter first filename(older file): ') + '.txt'
    f2 = path + input('Enter second filename(newer file): ') + '.txt'
    compare_file(f1, f2)
config.py
# Your Instagram username and password.
# main.py refuses to run while `username` still holds the placeholder below.
# NOTE: these are stored in plain text -- do not commit real credentials.
username = 'USERNAME or EMAIL'
password = 'PASSWORD'

# Login page, login form endpoint and profile-lookup endpoint
# (the target username is appended to p_url).
url = 'https://www.instagram.com/accounts/login/'
ajax_url = 'https://www.instagram.com/accounts/login/ajax/'
p_url = 'https://i.instagram.com/api/v1/users/web_profile_info/?username='

# Directory prefix for the saved session and all generated files; it is
# concatenated directly with file names, so keep the trailing slash.
path = r'./data/'

# Request headers sent with every request.
headers = {
    'authority': 'www.instagram.com',
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-language': 'zh-TW,zh;q=0.9',
    'dnt': '1',
    'sec-ch-prefers-color-scheme': 'dark',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'sec-gpc': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 '
        'Instagram 105.0.0.11.118 (iPhone11,8; iOS 12_3_1; en_US; en-US; '
        'scale=2.00; 828x1792; 165586599)'
    ),
    'viewport-width': '1707',
    'X-Requested-With': 'XMLHttpRequest',
}