Skip to content

Commit 9025ed5

Browse files
authored
Merge pull request #146 from autoscrape-labs/feat/network-inspecting
Network inspecting capabilities
2 parents: cf814bf + 3b15a3c · commit 9025ed5

File tree

8 files changed

+837
-19
lines changed

8 files changed

+837
-19
lines changed

docs/deep-dive/event-system.md

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -312,7 +312,7 @@ async def scrape_dynamic_content():
312312
if 'api/products' in event['params']['response']['url']:
313313
# Extract the response body
314314
request_id = event['params']['requestId']
315-
body, is_base64 = await tab.get_network_response_body(request_id)
315+
body = await tab.get_network_response_body(request_id)
316316

317317
# Process the data
318318
products = json.loads(body)
@@ -483,7 +483,7 @@ async def dynamic_tab_creation():
483483

484484
# Extract categories from the response
485485
request_id = event['params']['requestId']
486-
body, _ = await main_tab.get_network_response_body(request_id)
486+
body = await main_tab.get_network_response_body(request_id)
487487
categories = json.loads(body)
488488

489489
print(f"Found {len(categories)} categories to process")
@@ -502,7 +502,7 @@ async def dynamic_tab_creation():
502502

503503
# Process the product data
504504
request_id = event['params']['requestId']
505-
body, _ = await tab.get_network_response_body(request_id)
505+
body = await tab.get_network_response_body(request_id)
506506
products = json.loads(body)
507507

508508
# Add to results
@@ -699,10 +699,13 @@ async def log_request(tab, event):
699699
await tab.on(NetworkEvent.REQUEST_WILL_BE_SENT, partial(log_request, tab))
700700

701701
# After performing actions, retrieve logs
702-
api_logs = await tab.get_network_logs(matches=["api", "graphql"])
702+
api_logs = await tab.get_network_logs(filter="api")
703703

704-
# Get response bodies for specific requests
705-
json_responses = await tab.get_network_response_bodies(matches=["api/data"])
704+
# Get response bodies for specific requests by filtering logs first
705+
api_logs = await tab.get_network_logs(filter="api/data")
706+
for log in api_logs:
707+
request_id = log['params']['requestId']
708+
body = await tab.get_network_response_body(request_id)
706709
```
707710

708711
### DOM Events for Structure Monitoring

docs/deep-dive/tab-domain.md

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,236 @@ await tab.on('Page.javascriptDialogOpening', handle_dialog)
455455
await tab.execute_script("alert('This is a test alert')")
456456
```
457457

458+
## Network Analysis Methods
459+
460+
The Tab domain provides specialized methods for analyzing network traffic and extracting response data. These methods require network events to be enabled first.
461+
462+
### Network Logs Retrieval
463+
464+
The `get_network_logs()` method provides access to all captured network requests:
465+
466+
```python
467+
# Enable network monitoring
468+
await tab.enable_network_events()
469+
470+
# Navigate to trigger network requests
471+
await tab.go_to('https://example.com/api-heavy-page')
472+
473+
# Get all network logs
474+
all_logs = await tab.get_network_logs()
475+
print(f"Captured {len(all_logs)} network requests")
476+
477+
# Filter logs by URL content
478+
api_logs = await tab.get_network_logs(filter='api')
479+
static_logs = await tab.get_network_logs(filter='.js')
480+
domain_logs = await tab.get_network_logs(filter='example.com')
481+
482+
print(f"API requests: {len(api_logs)}")
483+
print(f"JavaScript files: {len(static_logs)}")
484+
print(f"Domain requests: {len(domain_logs)}")
485+
```
486+
487+
### Response Body Extraction
488+
489+
The `get_network_response_body()` method allows extraction of actual response content:
490+
491+
```python
492+
from functools import partial
493+
from pydoll.protocol.network.events import NetworkEvent
494+
495+
# Storage for captured responses
496+
captured_responses = {}
497+
498+
async def capture_api_responses(tab, event):
499+
"""Capture response bodies from API calls"""
500+
request_id = event['params']['requestId']
501+
response = event['params']['response']
502+
url = response['url']
503+
504+
# Only capture API responses
505+
if '/api/' in url and response['status'] == 200:
506+
try:
507+
# Extract the response body
508+
body = await tab.get_network_response_body(request_id)
509+
captured_responses[url] = body
510+
print(f"Captured response from: {url}")
511+
except Exception as e:
512+
print(f"Failed to capture response: {e}")
513+
514+
# Enable network monitoring and register callback
515+
await tab.enable_network_events()
516+
await tab.on(NetworkEvent.RESPONSE_RECEIVED, partial(capture_api_responses, tab))
517+
518+
# Navigate to trigger API calls
519+
await tab.go_to('https://example.com/dashboard')
520+
await asyncio.sleep(3) # Wait for API calls
521+
522+
print(f"Captured {len(captured_responses)} API responses")
523+
```
524+
525+
### Practical Network Analysis Example
526+
527+
Here's a comprehensive example combining both methods for thorough network analysis:
528+
529+
```python
530+
import asyncio
531+
import json
532+
from functools import partial
533+
from pydoll.browser.chromium import Chrome
534+
from pydoll.protocol.network.events import NetworkEvent
535+
536+
async def comprehensive_network_analysis():
537+
async with Chrome() as browser:
538+
tab = await browser.start()
539+
540+
# Storage for analysis results
541+
analysis_results = {
542+
'api_responses': {},
543+
'failed_requests': [],
544+
'request_summary': {}
545+
}
546+
547+
async def analyze_responses(tab, event):
548+
"""Analyze network responses"""
549+
request_id = event['params']['requestId']
550+
response = event['params']['response']
551+
url = response['url']
552+
status = response['status']
553+
554+
# Track failed requests
555+
if status >= 400:
556+
analysis_results['failed_requests'].append({
557+
'url': url,
558+
'status': status,
559+
'request_id': request_id
560+
})
561+
return
562+
563+
# Capture successful API responses
564+
if '/api/' in url and status == 200:
565+
try:
566+
body = await tab.get_network_response_body(request_id)
567+
568+
# Try to parse JSON responses
569+
try:
570+
data = json.loads(body)
571+
analysis_results['api_responses'][url] = {
572+
'data': data,
573+
'size': len(body),
574+
'type': 'json'
575+
}
576+
except json.JSONDecodeError:
577+
analysis_results['api_responses'][url] = {
578+
'data': body,
579+
'size': len(body),
580+
'type': 'text'
581+
}
582+
583+
except Exception as e:
584+
print(f"Failed to capture response from {url}: {e}")
585+
586+
# Enable monitoring and register callback
587+
await tab.enable_network_events()
588+
await tab.on(NetworkEvent.RESPONSE_RECEIVED, partial(analyze_responses, tab))
589+
590+
# Navigate and perform actions
591+
await tab.go_to('https://example.com/complex-app')
592+
await asyncio.sleep(5) # Wait for network activity
593+
594+
# Get comprehensive logs
595+
all_logs = await tab.get_network_logs()
596+
api_logs = await tab.get_network_logs(filter='api')
597+
598+
# Generate summary
599+
analysis_results['request_summary'] = {
600+
'total_requests': len(all_logs),
601+
'api_requests': len(api_logs),
602+
'failed_requests': len(analysis_results['failed_requests']),
603+
'captured_responses': len(analysis_results['api_responses'])
604+
}
605+
606+
# Display results
607+
print("🔍 Network Analysis Results:")
608+
print(f" Total requests: {analysis_results['request_summary']['total_requests']}")
609+
print(f" API requests: {analysis_results['request_summary']['api_requests']}")
610+
print(f" Failed requests: {analysis_results['request_summary']['failed_requests']}")
611+
print(f" Captured responses: {analysis_results['request_summary']['captured_responses']}")
612+
613+
# Show failed requests
614+
if analysis_results['failed_requests']:
615+
print("\n❌ Failed Requests:")
616+
for failed in analysis_results['failed_requests']:
617+
print(f" {failed['status']} - {failed['url']}")
618+
619+
# Show captured API data
620+
if analysis_results['api_responses']:
621+
print("\n✅ Captured API Responses:")
622+
for url, info in analysis_results['api_responses'].items():
623+
print(f" {url} ({info['type']}, {info['size']} bytes)")
624+
625+
return analysis_results
626+
627+
# Run the analysis
628+
asyncio.run(comprehensive_network_analysis())
629+
```
630+
631+
### Use Cases for Network Analysis
632+
633+
These network analysis methods enable powerful automation scenarios:
634+
635+
**API Testing and Validation:**
636+
```python
637+
# Validate API responses during automated testing
638+
api_logs = await tab.get_network_logs(filter='/api/users')
639+
for log in api_logs:
640+
request_id = log['params']['requestId']
641+
response_body = await tab.get_network_response_body(request_id)
642+
data = json.loads(response_body)
643+
644+
# Validate response structure
645+
assert 'users' in data
646+
assert len(data['users']) > 0
647+
```
648+
649+
**Performance Monitoring:**
650+
```python
651+
# Monitor request timing and sizes
652+
all_logs = await tab.get_network_logs()
653+
large_responses = []
654+
655+
for log in all_logs:
656+
if 'response' in log['params']:
657+
response = log['params']['response']
658+
if response.get('encodedDataLength', 0) > 1000000: # > 1MB
659+
large_responses.append({
660+
'url': response['url'],
661+
'size': response['encodedDataLength']
662+
})
663+
664+
print(f"Found {len(large_responses)} large responses")
665+
```
666+
667+
**Data Extraction:**
668+
```python
669+
# Extract dynamic content loaded via AJAX
670+
await tab.go_to('https://spa-application.com')
671+
await asyncio.sleep(3) # Wait for AJAX calls
672+
673+
data_logs = await tab.get_network_logs(filter='/data/')
674+
extracted_data = []
675+
676+
for log in data_logs:
677+
request_id = log['params']['requestId']
678+
try:
679+
body = await tab.get_network_response_body(request_id)
680+
data = json.loads(body)
681+
extracted_data.extend(data.get('items', []))
682+
except:
683+
continue
684+
685+
print(f"Extracted {len(extracted_data)} data items")
686+
```
687+
458688
### File Upload Handling
459689

460690
The Tab domain provides a context manager for handling file uploads:

0 commit comments

Comments
 (0)