Skip to content

Commit

Permalink
chore(ua): remove default user agent spoof
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jul 5, 2023
1 parent d7482c7 commit c3619fd
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 27 deletions.
66 changes: 44 additions & 22 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "website_crawler"
version = "0.8.7"
version = "0.8.8"
authors = ["Jeff Mendez <[email protected]>"]
edition = "2021"
description = "gRPC tokio based web crawler"
Expand All @@ -23,7 +23,7 @@ regex = { version = "^1.5.0", optional = true }
hashbrown = { version = "0.13.2" }
log = "0.4.16"
lazy_static = "1.4.0"
ua_generator = { git = "https://github.com/a11ywatch/ua_generator.git", version = "0.3.5" }
ua_generator = { git = "https://github.com/a11ywatch/ua_generator.git", version = "0.3.5", optional = true }
percent-encoding = "2.1.0"
env_logger = "0.9.0"
string_concat = "0.0.1"
Expand Down Expand Up @@ -61,3 +61,4 @@ os_info = "3"
[features]
jemalloc = ["jemallocator", "jemalloc-sys"]
regex = ["dep:regex"]
ua_generator = ["dep:ua_generator"]
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ brew install protobuf

`jemalloc` - use jemalloc memory allocator (default disabled).
`regex` - use the regex crate for blacklist urls validation.
`ua_generator` - use the ua_generator crate to spoof a random user agent.

## About

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
extern crate sitemap;
extern crate tokio;
extern crate tonic;

#[cfg(feature = "ua_generator")]
extern crate ua_generator;

// packages mainly for spider
Expand Down
19 changes: 19 additions & 0 deletions src/packages/spider/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,22 @@ impl Configuration {
}
}
}

/// Returns a spoofed user agent string supplied by the `ua_generator` crate.
///
/// Only compiled when the `ua_generator` cargo feature is enabled; the value
/// is whatever `ua_generator::ua::spoof_ua` hands back.
#[cfg(feature = "ua_generator")]
pub fn get_ua() -> &'static str {
    ua_generator::ua::spoof_ua()
}

/// get the user agent via cargo package + version.
#[cfg(not(any(feature = "ua_generator")))]
pub fn get_ua() -> &'static str {
use std::env;

lazy_static! {
static ref AGENT: &'static str =
concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
};

AGENT.as_ref()
}
5 changes: 2 additions & 3 deletions src/packages/spider/website.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::black_list::contains;
use super::configuration::Configuration;
use super::configuration::{Configuration, get_ua};
use super::page::{build, get_page_selectors, Page};
use super::robotparser::RobotFileParser;
use super::utils::log;
Expand Down Expand Up @@ -28,7 +28,6 @@ use tokio::task;
use tokio::task::JoinSet;
use tokio_stream::StreamExt;
use tonic::transport::Channel;
use ua_generator::ua::spoof_ua;
use url::Url;

/// Represents a website to crawl and gather all links.
Expand Down Expand Up @@ -194,7 +193,7 @@ impl Website {
.pool_idle_timeout(None)
.user_agent(match &self.configuration.user_agent {
Some(ua) => ua.as_str(),
_ => spoof_ua(),
_ => &get_ua(),
})
.brotli(true);

Expand Down

0 comments on commit c3619fd

Please sign in to comment.