在数据采集和分析工作中,Instagram 评论数据是一类非常有价值的内容,它可以反映用户的互动、情绪和热点话题。官方的 Graph API 需要认证和权限,而很多时候我们希望在不登录账号的情况下,通过模拟请求来抓取评论。
Instagram 使用 GraphQL,通过后端查询动态生成帖子页面的内容。该端点会返回帖子的各类数据,包括评论、点赞以及评论者信息等。因此我们可以利用这个 GraphQL 端点来抓取 Instagram 帖子数据。本文将详细说明步骤及方法,以供参考。
1. GraphQL 请求端点
以下是用于检索帖子数据的 GraphQL 端点(使用 POST 方法请求):https://www.instagram.com/graphql/query
每个 GraphQL 请求都需要携带一个 HTTP 请求体(body)。抓取帖子页面时,请求中需要用到以下值:
"csrftoken": "4OkRB9KIREX0imrqGS-3nn",
"__dyn":"7xeUjG1mxu1syUbFp41twpUnwgU7SbzEdF8aUco2qwJw5ux609vCwjE1EE2Cw8G11wBz81s8hwGxu786a3a1YwBgao6C0Mo2swtUd8-U2zxe2GewGw9a361qw8Xxm16wa-0oa2-azo7u3vwDwHg2ZwrUdUbGwmk0zU8oC1Iwqo5p0OwUQp1yUb8jxKi2qi7E5y4UrwHwcObBK4o16UswFwtF8",
"__csr":"g9YYrFmGt9AZfFv_VbuBl5KjGnvZSiQ-VpFdu9F3e8LAiGiEzyoa9u8F4ppCfX-EjKZFKmKuqidzFBAGbWxZ4Azox5zWplBBwAgGrxW5tS9ByQfggQuQqqm8zlgiggF4zVFUa6UDKq4XK8Cy8jxi6EB1i3eaxWaw05A1o3mwa21lwlUmgbWw73wio0wi0IdwdOkPxJ0Mw9a0eRw1Wi3lxqcw7Fwj87ut0d2q8ixd1rg6J09_g1eo3qe0hi5ohgKawaK1Xc00Cb80mMw1K2",
"__hsdp":"giMB0zOE8R2450clVE99qxi2m6opaE4C3Wdo56cwzw9FxG4A3m1vwiUgwi9E29iAecwXxy5obEao12E98sxauE98cU0w20esw4cwsEpwnU4y1wwDwYwa20xE3Ewcq0Syxe0hi6U5K2a5OGu1Twc-",
"__hblp":"0Pw8y1ey89axG2l7w4swJG0C8d8cU4CmdDAxC2qbAy98eFEaEeEfU8opxy54iq2a9xTwde3q2i78KquE98cUa8dE3VwtEb-bwkU3Qw60wci0L84-2e68dopwHy8aEcomwkU462-0E826wey0NEiwho5O2gEjwcm1exK2C26fx2cJ1sJ7x21Cx108i3y7Ueonw",
"shortcode":"DGLUo_6tKgY", # Post unique identifier
"doc_id": f"{INSTAGRAM_DOC_ID}", # Constant ID (identifier) used for Instagram posts
从以上细节可以得出结论:通过 POST 请求向 GraphQL 端点抓取 Instagram 帖子页面时,帖子 ID(短代码,shortcode)是唯一需要随帖子变化的变量。
2. 抓取帖子数据示例
让我们将此功能添加到我们的 Instagram 抓取工具中:
Python
import json
import requests
# Shortcode of the post to fetch (the identifier after /p/ in the post URL).
SHORTCODE = "DGLUo_6tKgY"
# Constant ID identifying the GraphQL query used for Instagram posts.
DOC_ID = "29599222026389233"
GRAPHQL_URL = "https://www.instagram.com/graphql/query"
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# Cookies copied from a browser session.
# NOTE(review): these values presumably expire — refresh them from a fresh
# browser visit if the request starts failing.
cookies = {
    "csrftoken": "4OkRB9KIREX0imrqGS-3nn",
    "datr": "nRmUaG1CyG0YIekOcKDNZ0UI",
    "ig_did": "F11F1FA5-C00E-4CF6-A249-2A917DD31183",
    "mid": "aJQZnQALAAG_bJTx2M9CiAw52Zr7",
    "ig_nrcb": "1",
    "wd": "163x848",
    "ps_l": "1",
    "ps_n": "1",
}

# Browser-like headers; "x-csrftoken" carries the same value as the
# "csrftoken" cookie above.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded",
    "origin": "https://www.instagram.com",
    "pragma": "no-cache",
    "priority": "u=1, i",
    # Derived from SHORTCODE so the referer always names the post being
    # requested (the original hard-coded a different post here).
    "referer": f"https://www.instagram.com/p/{SHORTCODE}/",
    "sec-ch-prefers-color-scheme": "dark",
    "sec-ch-ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
    "sec-ch-ua-full-version-list": '"Not)A;Brand";v="8.0.0.0", "Chromium";v="138.0.7204.184", "Google Chrome";v="138.0.7204.184"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "x-csrftoken": cookies["csrftoken"],
    "x-fb-friendly-name": "PolarisPostActionLoadPostQueryQuery",
    "x-ig-app-id": "936619743392459",
}

# Form-encoded body. "variables" is itself a JSON document; building it with
# json.dumps (compact separators) yields the same string as the hand-written
# literal while keeping the shortcode in one place.
data = {
    "variables": json.dumps(
        {
            "shortcode": SHORTCODE,
            "fetch_tagged_user_count": None,
            "hoisted_comment_id": None,
            "hoisted_reply_id": None,
        },
        separators=(",", ":"),
    ),
    "doc_id": DOC_ID,
}

response = requests.post(
    GRAPHQL_URL,
    cookies=cookies,
    headers=headers,
    data=data,
    timeout=REQUEST_TIMEOUT,
)
# Fail with a clear HTTP error instead of a confusing JSON decode error when
# the server answers with a non-2xx status.
response.raise_for_status()

# Persist the raw response so it can be inspected or parsed later.
with open("./test.json", "w", encoding="utf-8") as f:
    json.dump(response.json(), f, ensure_ascii=False, indent=2)
上面的 Instagram 抓取代码会请求整个帖子数据集并将其保存到 test.json 文件中,其中包括帖子说明文字(caption)、评论、点赞等各种字段。但是,返回的数据中也包含许多标志位和不必要的字段,这些字段用处不大。
3.解析Instagram帖子数据
Instagram 帖子数据比用户个人资料数据更加复杂。因此,我们需要根据自己的需求进行简化处理,以减小其大小:
Python
def recombineVideoOrSidecarData(input_url, original_json):
    """Flatten a raw Instagram GraphQL post response into a compact video record.

    The raw payload is first dumped to ``./test.json`` for inspection, then the
    ``data.xdt_shortcode_media`` node is reduced to a flat dictionary.

    Args:
        input_url: URL the caller requested; copied verbatim to ``inputUrl``.
        original_json: Parsed JSON response of the GraphQL post query.

    Returns:
        A one-element list holding the simplified post dict when the media
        node is a video (``__typename`` contains ``"Video"`` and
        ``video_duration`` is truthy). An empty list for any other post type,
        which this function does not process.

    Raises:
        ValueError: If ``data.xdt_shortcode_media`` is missing or empty.

    Note:
        Comment nodes are formatted by ``_format_comment_node``, which is not
        defined in this snippet and must be provided by the surrounding module.
    """
    # Function-scope imports keep this snippet runnable when copied on its own.
    import json
    import re
    from datetime import datetime, timezone

    def extract_tags(text, prefix):
        """Return every ``prefix`` + word/dot run found in ``text``."""
        if not text:
            return []
        return re.findall(rf"{re.escape(prefix)}[\w.]+", text)

    def format_timestamps_recursively(obj):
        """Rewrite the "+00:00" UTC offset as "Z" in every "timestamp" string."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "timestamp" and isinstance(value, str):
                    obj[key] = value.replace("+00:00", "Z")
                else:
                    format_timestamps_recursively(value)
        elif isinstance(obj, list):
            for item in obj:
                format_timestamps_recursively(item)

    # Keep a copy of the raw payload on disk for debugging.
    with open("./test.json", "w", encoding="utf-8") as f:
        json.dump(original_json, f, ensure_ascii=False, indent=4)

    print(" [Processor] Reconstructing video data in memory...")
    # `or {}` guards against keys that are present but JSON null.
    media = (original_json.get("data") or {}).get("xdt_shortcode_media")
    if not media:
        raise ValueError("Processing failed: 'data.xdt_shortcode_media' node not found in the incoming JSON object.")

    typename = media.get("__typename") or ""
    if not ("Video" in typename and media.get("video_duration")):
        # Image and sidecar posts need their own processing function.
        print(f" [Processor] Skipping non-video post type {typename!r}.")
        return []

    print("--------------Process data into video posts-----------")

    # A post without a caption has an empty edge list, so never index blindly.
    caption_edges = (media.get("edge_media_to_caption") or {}).get("edges") or []
    caption_node = (caption_edges[0] or {}).get("node") or {} if caption_edges else {}
    caption_text = caption_node.get("text", "")

    comments_info = media.get("edge_media_to_parent_comment") or {}
    comments_edges = comments_info.get("edges") or []
    latest_comments = []
    for edge in comments_edges:
        comment_node = edge.get("node")
        if not comment_node:
            continue
        formatted_comment = _format_comment_node(comment_node)
        if formatted_comment:
            latest_comments.append(formatted_comment)
    first_comment = ""
    if comments_edges:
        first_comment = (comments_edges[0].get("node") or {}).get("text", "")

    tagged_users_edges = (media.get("edge_media_to_tagged_user") or {}).get("edges") or []
    tagged_users = []
    for edge in tagged_users_edges:
        user_node = (edge.get("node") or {}).get("user")
        if user_node:
            tagged_users.append(user_node)

    # These nodes may be null in the payload (e.g. a video without music).
    music_info_raw = media.get("clips_music_attribution_info") or {}
    dimensions = media.get("dimensions") or {}
    owner = media.get("owner") or {}

    transformed_post = {
        "inputUrl": input_url,
        "id": media.get("id"),
        "type": typename.replace("XDTGraph", ""),
        "shortCode": media.get("shortcode"),
        "caption": caption_text,
        "hashtags": extract_tags(caption_text, "#"),
        "mentions": extract_tags(caption_text, "@"),
        "url": f"https://www.instagram.com/p/{media.get('shortcode')}/",
        "commentsCount": comments_info.get("count", 0),
        "firstComment": first_comment,
        "latestComments": latest_comments,
        "dimensionsHeight": dimensions.get("height"),
        "dimensionsWidth": dimensions.get("width"),
        "displayUrl": media.get("display_url"),
        "images": [],
        "videoUrl": media.get("video_url"),
        "alt": media.get("accessibility_caption"),
        "likesCount": (media.get("edge_media_preview_like") or {}).get("count", 0),
        "videoViewCount": media.get("video_view_count"),
        "videoPlayCount": media.get("video_play_count"),
        "timestamp": datetime.fromtimestamp(
            media.get("taken_at_timestamp") or 0, tz=timezone.utc
        ).isoformat(),
        "childPosts": [],
        "locationName": "",
        "locationId": "",
        "ownerFullName": owner.get("full_name"),
        "ownerUsername": owner.get("username"),
        "ownerId": owner.get("id"),
        "productType": media.get("product_type"),
        "videoDuration": media.get("video_duration"),
        "isSponsored": media.get("is_paid_partnership", False),
        "taggedUsers": tagged_users,
        "musicInfo": {
            "artist_name": music_info_raw.get("artist_name", ""),
            "song_name": music_info_raw.get("song_name", ""),
            "uses_original_audio": music_info_raw.get("uses_original_audio", False),
            "should_mute_audio": music_info_raw.get("should_mute_audio", False),
            "should_mute_audio_reason": music_info_raw.get(
                "should_mute_audio_reason", ""
            ),
            "audio_id": music_info_raw.get("audio_id", "0"),
        },
        "isCommentsDisabled": media.get("comments_disabled"),
    }

    format_timestamps_recursively(transformed_post)
    print(f" [Processor] ✅ Video data {media.get('shortcode')} reorganization successful.")
    return [transformed_post]
以上是处理视频帖子评论数据的代码示例。其中用于格式化单条评论的 _format_comment_node 函数未在本文中给出,需要根据自己所需的字段自行实现;另外,静态图片类帖子的处理方式不同,也需要自己编写对应的处理函数。