Internship on RAG technology

--- title: $ rag_cli initdb --- flowchart LR subgraph data["`**../data**`"] F1("fa:fa-file ../data/*.txt") end c1("chunk") c2("chunk") c4("chunk") c3("chunk") c5("chunk") subgraph A["`fa:fa-database vector store **~/data/rag/**`"] v1("vector/node") v2("vector/node") v3("vector/node") v4("vector/node") v5("vector/node") end F1 --> c1 & c2 & c3 & c4 & c5 c1 --> v1 c2 --> v2 c3 --> v3 c4 --> v4 c5 --> v5
--- title: $ rag_cli updatedb --file ../data/new.txt --- flowchart LR subgraph data["`**../data**`"] F1("fa:fa-file ../data/new.txt") end c1("chunk") c2("chunk") subgraph A["`fa:fa-database vector store **~/data/rag/**`"] v1("new vector/node") v2("new vector/node") v3("vector/node") v4("vector/node") v5("vector/node") end F1 ---> c1 & c2 c1 --> v1 c2 --> v2
--- title: crawler.py --- flowchart TB %% data storage urls[("seed urls list")] urls_queue[("urls queue")] urls_visited[("visited urls")] db[("database (VectorIndex)")] %% function is_vis[["is_visited()"]] add_vis[["add_to_visited()"]] add_new[["add_url_to_queue()"]] dow_extr[["extract_new_urls()"]] process[["SimpleWebPageReader()"]] remove[["remove_url_to_queue()"]] urls --> add_new --> urls_queue urls_queue --> is_vis --> dow_extr urls_visited --> process --> db dow_extr --> add_new & add_vis & remove is_vis --> remove add_vis --> urls_visited