Python crawler practice: Instagram follow(ing/er) list crawler

GitHub link

Learning notes

Topics reviewed in this exercise:
- writing a Python crawler
- comparing two sets of records with difflib
- storing the login session with pickle (see the short sketch after these notes)
- reporting errors with try/except
- paths, reading and writing files
- generating Excel files (openpyxl -> Workbook)
TODO: document the usage and add a README.md.
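As a quick illustration of the pickle-stored-session idea before the full script: the crawler logs in once, serializes the session's cookie jar with pickle, and reloads it on later runs. This is only a minimal sketch with a hypothetical cookie-file name and a placeholder URL, not the real login flow used below:

import pickle
import requests

COOKIE_FILE = 'session.pkl'   # hypothetical file name

# first run: perform whatever requests log you in, then persist the cookie jar
with requests.Session() as session:
    session.get('https://example.com/login')    # placeholder; real login omitted
    with open(COOKIE_FILE, 'wb') as f:
        pickle.dump(session.cookies, f)

# later run: restore the saved cookies into a fresh session
with requests.Session() as session:
    with open(COOKIE_FILE, 'rb') as f:
        session.cookies.update(pickle.load(f))
    # session now sends the previously saved cookies with every request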

main.py

# from pprint import pprint
from bs4 import BeautifulSoup
import requests, pickle
import re
from config import username, password, headers, url, ajax_url, p_url, path
from datetime import datetime
from openpyxl import Workbook
import compare
import os, sys


def create_folder(path):
    if not os.path.isdir(path):
        os.makedirs(path)


def ask_excel(ask_option):
    flag = []
    yes_list = ['y', 'Y', 'yes']
    no_list = ['n', 'N', 'no', '']
    while flag not in yes_list and flag not in no_list:
        flag = input(f'Do u want {ask_option}? y/n [n]: ')
        if flag not in yes_list and flag not in no_list:
            print('plz enter y or n or ENTER!!!')
    # print(f'flag={flag}')
    if flag in yes_list:
        return 1
    elif flag in no_list:
        return 0


def do_excel(uid, date, opt_title, option, root_json, path):          # generate the Excel file
    wb = Workbook()
    ws = wb.active
    title = ['username', 'full_name', 'profile_pic']
    ws.append(title)
    for users in root_json['users']:
        id = []
        id.append('@' + users['username'])
        id.append(users['full_name'])
        id.append(users['profile_pic_url'] + '.jpg')
        ws.append(id)
    wb.save(path + f'{uid}{date}{opt_title[option]}.xlsx')


def do_txt(uid, date, opt_title, option, root_json, path):            # generate the txt file
    i = 1
    with open(path + f'{uid}{date}{opt_title[option]}.txt', 'w+', encoding='utf-8') as f:
        for users in root_json['users']:
            # id = (f'{i}', '@'+users['username'], users['full_name'])
            id = ('@' + users['username'], users['full_name'])
            i += 1
            f.write(str(id) + '\n')
        f.write(f'Total: {i-1} records!')
    print(f'Got {i-1} records!!!')


def main():
    date = datetime.now().strftime("%Y%m%d-%H%M")
    time = int(datetime.now().timestamp())
    payload = {
        'username': f'{username}',
        'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{time}:{password}',
        'queryParams': {},
        'optIntoOneTap': 'false'
    }
    print(f'If target is private account, you have to follow it first!!!')
    while True:
        uid = str(input('Enter id: '))
        if uid == '':
            print(f'Do not leave blank!!!')
        else:
            break
    while True:
        opt_list = {'': 'following', '1': 'following', '2': 'followers',
                    'following': 'following', 'followers': 'followers'}
        option = str(input('following[1]/followers[2] [1]: '))
        if option in opt_list:
            option = opt_list[option]
            break
        else:
            print(f'enter 1 or 2 or following or followers!!!')
    try:
        fcount = int(input('Enter max num of following/followers [2000]: '))
    except ValueError:
        fcount = 2000
    opt_title = {
        'following': 'fwi',
        'followers': 'fwr',
    }
    ask = ask_excel('Excel file(will have profile pic)')
    # print(ask)
    with requests.Session() as session:
        if not os.path.exists(f'{path}{username}session.pkl'):
            print('Getting sessions')      # session = requests.sess.....
            res = session.get(url)
            csrf = re.findall(r"csrf_token\":\"(.*?)\"", res.text)[0]
            cookies = res.cookies          # first request: grab the initial cookies and csrf token
            cookies['csrf'] = csrf
            headers['x-csrftoken'] = csrf
            # print(headers)
            session.post(ajax_url, data=payload, headers=headers, cookies=cookies)
            with open(f'{path}{username}session.pkl', 'wb') as f:
                pickle.dump(session.cookies, f)    # log in with the cookies/csrf token, then persist the session cookies
            headers['Referer'] = f'https://www.instagram.com/{uid}/following/'
        # print(req2.text)
        else:
            print('Reloading sessions and updating cookies')
            headers['Referer'] = f'https://www.instagram.com/{uid}/following/'
            with open(f'{path}{username}session.pkl', 'rb') as f:
                cookies = session.cookies.update(pickle.load(f))
                headers['x-csrftoken'] = session.cookies['csrftoken']
                # print(session.cookies)
                # print(headers)
        fsi = session.get(p_url + uid, cookies=cookies, headers=headers)
        # print(fsi.text)
        try:
            # print(str(re.findall(r"id\":\"(.*?)\"", fsi.text)))
            friendid = str(re.findall(r"id\":\"(.*?)\"", fsi.text)[1])
            checkid = str(re.findall(r"id\":\"(.*?)\"", fsi.text)[-1])
            if friendid == '236' or friendid is None or checkid == '236':
                raise Exception
            print(f"userid:{friendid}")
        except Exception:
            os.remove(f'{path}{username}session.pkl')
            print(f'error while checking userid')
            print(f'1.plz check the target username(no @)!!!')
            print(f'2.Make sure u set the right USERNAME and PASSWORD in *config.py* file!!!')
            print(f'3.your account might be blocked by the instagram server, plz try again later or change your ip!!')
            sys.exit()
        # max_id could be advanced page by page like pagination; here one big count is requested instead
        params = {
            'count': fcount,
            'max_id': '',
            'search_surface': 'follow_list_page'}
        response = session.get(f'https://i.instagram.com/api/v1/friendships/{friendid}/{option}/',
                               params=params, cookies=cookies, headers=headers)
        # print(response.text)
        try:
            root_json = response.json()
        except requests.exceptions.JSONDecodeError as jsonError:
            print(f'Error when processing json file: {jsonError}')
            print(f'1.Make sure u set the right USERNAME and PASSWORD in *config.py* file!!!')
            print(f'2.your account might be blocked by the instagram server, plz try again later or change your ip!!')
            sys.exit()
    if ask == 1:
        try:
            do_excel(uid, date, opt_title, option, root_json, path)
        except IOError as error:
            print(f'Error when generate Excel file:{error}')
    # pprint(response.text)
    do_txt(uid, date, opt_title, option, root_json, path)
    ask2 = ask_excel('compare with old file')
    if ask2 == 1:
        try:
            f1 = path + input(f'Enter first filename(older file): ') + '.txt'
            f2 = path + f'{uid}{date}{opt_title[option]}' + '.txt'
            compare.compare_file(f1, f2)
        except IOError as error:
            print(f'Error when generate compared.txt file:{error}')
            print(f'Make sure u have the file existed and enter the right filename(without .txt)!!!')
            sys.exit()


if __name__ == '__main__':
    create_folder(path)
    if username == 'USERNAME or EMAIL':
        print(f'plz go config.py to set your USERNAME and PASSWORD')
        os._exit(0)
    main()
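The in-code comment about max_id notes that, instead of asking for one huge count, the list could be walked page by page. Below is a hedged sketch of that idea; it assumes the unofficial endpoint keeps returning a next_max_id field while more results remain, which is undocumented behavior and may change at any time:

def fetch_all(session, friendid, option, cookies, headers, page_size=200):
    # Hypothetical pagination loop; relies on the unofficial next_max_id field.
    users, max_id = [], ''
    while True:
        params = {'count': page_size, 'max_id': max_id,
                  'search_surface': 'follow_list_page'}
        resp = session.get(
            f'https://i.instagram.com/api/v1/friendships/{friendid}/{option}/',
            params=params, cookies=cookies, headers=headers)
        data = resp.json()
        users.extend(data.get('users', []))
        max_id = data.get('next_max_id')
        if not max_id:            # no further pages
            return users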

compare.py

import sys
import difflib
import os
from config import path


def create_folder(path):
    if not os.path.isdir(path):
        os.makedirs(path)


# read a file and return its lines
def read_file(file_name):
    try:
        file_handle = open(file_name, 'r', encoding="utf-8")
        text = file_handle.read().splitlines()   # split the content into lines
        file_handle.close()
        return text
    except IOError as error:
        print('Read file Error: {0}'.format(error))
        sys.exit()


# compare two files and write the result as text and HTML
def compare_file(file1_name, file2_name):
    if file1_name == "" or file2_name == "":
        print('File paths must not be empty: file1_name is: {0}, file2_name is: {1} .'.format(file1_name, file2_name))
        sys.exit()
    text1_lines = read_file(file1_name)
    text2_lines = read_file(file2_name)
    print_list = ['+', '-']
    no_print_list = ['!']
    d = difflib.Differ()
    diff_print = [a for a in d.compare(text1_lines, text2_lines)
                  if a[0] in print_list and a[-1] not in no_print_list]
    if diff_print:
        print('The comparison result will be stored in compared.txt and result.html !')
        try:
            with open(path + 'compared.txt', 'w', encoding="utf-8") as result_file:
                result_file.write('\n'.join(diff_print))
            print('\n'.join(diff_print))
        except IOError as error:
            print('Error while writing compared.txt: {0}'.format(error))
    else:
        print(f"It's all same as old file")
    diff = difflib.HtmlDiff()                            # create an HtmlDiff object
    result = diff.make_file(text1_lines, text2_lines)    # make_file renders the comparison as HTML
    # save the result to result.html and keep it next to the data files
    try:
        with open(path + 'result.html', 'w', encoding="utf-8") as result_file:
            result_file.write(result)
    except IOError as error:
        print('Error while writing the html file: {0}'.format(error))


if __name__ == '__main__':
    create_folder(path)
    f1 = path + input(f'Enter first filename(older file): ') + '.txt'
    f2 = path + input(f'Enter second filename(newer file): ') + '.txt'
    compare_file(f1, f2)
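For reference, this is what difflib.Differ output looks like and why compare_file filters on the first and last character: Differ prefixes unchanged lines with '  ', lines only in the old file with '- ', and lines only in the new file with '+ '. The extra a[-1] not in ['!'] check appears to be there to drop the trailing 'Total: N records!' line that do_txt writes (it only works as long as no username/full-name line ends with '!'). A small self-contained example:

import difflib

old = ["('@alice', 'Alice')", "('@bob', 'Bob')", 'Total: 2 records!']
new = ["('@alice', 'Alice')", "('@carol', 'Carol')", 'Total: 2 records!']

for line in difflib.Differ().compare(old, new):
    print(line)

# Expected output:
#     ('@alice', 'Alice')      <- unchanged, prefix '  '
#   - ('@bob', 'Bob')          <- only in the old list, prefix '- '
#   + ('@carol', 'Carol')      <- only in the new list, prefix '+ '
#     Total: 2 records!        <- identical here; when the totals differ, the
#                                 '- ... records!' / '+ ... records!' lines are
#                                 the ones removed by the a[-1] not in ['!'] filter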

config.py

# Your Instagram username and password
username = 'USERNAME or EMAIL'
password = 'PASSWORD'

url = f'https://www.instagram.com/accounts/login/'
ajax_url = f'https://www.instagram.com/accounts/login/ajax/'
p_url = f'https://i.instagram.com/api/v1/users/web_profile_info/?username='
path = r'./data/'

headers = {
    'authority': 'www.instagram.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-TW,zh;q=0.9',
    'dnt': '1',
    'sec-ch-prefers-color-scheme': 'dark',
    'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'sec-gpc': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 105.0.0.11.118 (iPhone11,8; iOS 12_3_1; en_US; en-US; scale=2.00; 828x1792; 165586599)',
    'viewport-width': '1707',
    'X-Requested-With': 'XMLHttpRequest',
}
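As a rough way to check that config.py is filled in and the headers still work, one could fetch the login page and look for the same csrf_token pattern that main.py extracts. This is a hypothetical smoke test, not part of the original project, and whether the token is still embedded in the page depends on Instagram's current markup:

import re
import requests
from config import url, headers

res = requests.get(url, headers=headers, timeout=10)
tokens = re.findall(r"csrf_token\":\"(.*?)\"", res.text)
print('csrf_token found' if tokens
      else 'csrf_token not found - check url/headers, or Instagram changed its markup')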
