Skip to content

Commit

Permalink
feat(spider): add spider engine crate
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jan 12, 2024
1 parent 3326ab8 commit d2ebbe5
Show file tree
Hide file tree
Showing 30 changed files with 497 additions and 4,183 deletions.
807 changes: 441 additions & 366 deletions Cargo.lock

Large diffs are not rendered by default.

30 changes: 5 additions & 25 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "website_crawler"
version = "0.8.12"
version = "0.9.0"
authors = ["Jeff Mendez <[email protected]>"]
edition = "2021"
description = "gRPC tokio based web crawler"
Expand All @@ -12,34 +12,14 @@ categories = ["accessibility", "asynchronous"]
include = ["src/*", "build.rs", "proto/*", "LICENSE", "README.md"]

[dependencies]
tokio = { version = "1.33.0", features = [ "rt-multi-thread", "macros", "sync", "time", "parking_lot" ] }
tokio-stream = "0.1.14"
tokio = { version = "1.35.1", features = [ "rt-multi-thread", "macros", "sync", "time", "parking_lot" ] }
tonic = { version = "0.9.2" }
prost = "0.11.3"
prost-types = "0.11.2"
reqwest = { version = "0.11.18", features = ["deflate", "brotli", "gzip", "native-tls-alpn", "socks", "stream" ] }
url = "2.4.0"
regex = { version = "^1.5.0", optional = true }
hashbrown = { version = "0.13.2" }
log = "0.4.16"
lazy_static = "1.4.0"
ua_generator = { git = "https://github.com/a11ywatch/ua_generator.git", version = "0.3.5", optional = true }
percent-encoding = "2.1.0"
env_logger = "0.9.0"
string_concat = "0.0.1"
sitemap = "0.4.1"
xml-rs = "0.8.4"
compact_str = "0.7.1"
selectors = "0.24.0"
tendril = "0.4.3"
ahash = "0.8.3"
matches = "0.1.10"
cssparser = "0.29.6"
smallvec = "1.10.0"
ego-tree = "0.6.2"
fast_html5ever = "0.26.1"
num_cpus = "1.15.0"
case_insensitive_string = { version = "0.1.6", features = ["compact"] }
spider = { version = "1.80.68", features = ["sync", "control", "sitemap"]}

[target.'cfg(all(not(target_os = "android"), not(target_os = "freebsd")))'.dependencies]
jemallocator = { version = "0.5.0", optional = true }
Expand All @@ -60,5 +40,5 @@ os_info = "3"

[features]
jemalloc = ["jemallocator", "jemalloc-sys"]
regex = ["dep:regex"]
ua_generator = ["dep:ua_generator"]
regex = ["spider/regex"]
chrome = ["spider/chrome"]
4 changes: 2 additions & 2 deletions examples/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub mod crawler {
extern crate lazy_static;
use crate::tokio::macros::support::Pin;
use tokio::sync::mpsc;
use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt};
use spider::tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt};
use tonic::{Request, Response, Status};
pub use website::website_service_server::{WebsiteService, WebsiteServiceServer};
pub use website::{Empty, ScanInitParams, ScanParams, ScanStreamResponse};
Expand Down Expand Up @@ -73,7 +73,7 @@ impl WebsiteService for MyWebsiteService {
message: req.domain,
});

let mut stream = Box::pin(tokio_stream::iter(repeat));
let mut stream = Box::pin(spider::tokio_stream::iter(repeat));
let (tx, rx) = mpsc::channel(1);

match stream.next().await {
Expand Down
11 changes: 0 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
extern crate sitemap;
extern crate tokio;
extern crate tonic;

#[cfg(feature = "ua_generator")]
extern crate ua_generator;

// packages mainly for spider
extern crate hashbrown;
extern crate log;
extern crate reqwest;
extern crate url;
#[macro_use]
extern crate lazy_static;
pub extern crate compact_str;
pub use packages::spider;
#[macro_use]
extern crate fast_html5ever;
#[macro_use]
extern crate string_concat;

// internal packages.
pub mod interface;
pub mod packages;
pub mod rpc;
pub mod scanner;
pub use rpc::handlers::grpc_start;
3 changes: 0 additions & 3 deletions src/packages/mod.rs

This file was deleted.

1 change: 0 additions & 1 deletion src/packages/robotparser/mod.rs

This file was deleted.

Loading

0 comments on commit d2ebbe5

Please sign in to comment.