Note: during periods of high load you may find the notebook unresponsive. It may appear to execute a cell and update the completion number in brackets [#] to the left of the cell, yet the cell has not actually executed. This is particularly obvious with print statements, when no output appears. If this happens, restart the kernel using the command under the Kernel tab.
In retrieval augmented generation (RAG), an LLM retrieves contextual documents from an external dataset as part of its execution.
This is useful when we want to ask questions about specific documents (e.g., our PDFs, a set of videos, etc.).
#! pip install langchain
import os
import openai
import sys
sys.path.append('../..')  # add the parent directories to the import path
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']
Let's load a PDF transcript from Andrew Ng's famous CS229 course! These documents are the result of automated transcription, so words and sentences are sometimes split unexpectedly.
# The course will show the pip installs you would need to install packages on your own machine.
# These packages are already installed on this platform and should not be run again.
#! pip install pypdf
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()
Each page is a Document. A Document contains text (page_content) and metadata.
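For reference, you can also construct a Document directly. This is a minimal sketch, assuming the langchain.schema import shown; it is not part of the original notebook.

from langchain.schema import Document

# A Document is simply text plus a metadata dict.
doc = Document(
    page_content="Hello, world.",
    metadata={"source": "example.txt", "page": 0},
)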
len(pages)
22
page = pages[0]
print(page.page_content[0:500])
MachineLearning-Lecture01 Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine learning class. So what I wanna do today is ju st spend a little time going over the logistics of the class, and then we'll start to talk a bit about machine learning. By way of introduction, my name's Andrew Ng and I'll be instru ctor for this class. And so I personally work in machine learning, and I' ve worked on it for about 15 years now, and I actually think that machine learning i
page.metadata
{'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}
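Because the transcript was produced automatically, the text contains stray mid-word spaces. As a minimal cleanup sketch (my addition, not course code), you could normalize the whitespace runs; note this will not rejoin split words such as "ju st", which would need a dictionary-based fix:

import re

# Collapse runs of whitespace (including newlines) into single spaces.
cleaned = re.sub(r"\s+", " ", page.page_content)
print(cleaned[:200])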
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
# ! pip install yt_dlp
# ! pip install pydub
Note: This can take several minutes to complete.
url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
save_dir="docs/youtube/"
loader = GenericLoader(
YoutubeAudioLoader([url],save_dir),
OpenAIWhisperParser()
)
docs = loader.load()
[youtube] Extracting URL: https://www.youtube.com/watch?v=jGwO_UgTS7I [youtube] jGwO_UgTS7I: Downloading webpage [youtube] jGwO_UgTS7I: Downloading android player API JSON [info] jGwO_UgTS7I: Downloading 1 format(s): 140 [download] docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a has already been downloaded [download] 100% of 69.71MiB [ExtractAudio] Not converting audio docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a; file is already in target format m4a Transcribing part 1!
In this run the transcription request did not complete: the OpenAI API returned a transient server error (APIError: HTTP code 504 from API, 504 Gateway Time-out). This is a temporary service-side failure rather than a problem with the code, and rerunning the cell usually succeeds.
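Because the Whisper call can fail transiently like this, it can help to wrap the load in a simple retry loop. This is a sketch added for robustness, not part of the original notebook:

import time

docs = None
for attempt in range(3):
    try:
        docs = loader.load()
        break  # success
    except Exception as exc:  # e.g. an APIError from a 504 response
        print(f"Attempt {attempt + 1} failed: {exc}")
        time.sleep(10)  # back off briefly before retrying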
docs[0].page_content[0:500]
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")
docs = loader.load()
print(docs[0].page_content[:500])
handbook/37signals-is-you.md at master · basecamp/handbook · GitHub Skip to content Toggle navigation Sign up Product Actions Automate any workflow Packages Host and manage packages Security Find and fix vulnerabilities Code
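Notice how much of the loaded text is site navigation rather than page content; web loads usually need some post-processing. As a minimal sketch (my addition, not part of the original notebook), you could at least collapse the whitespace before using the text downstream:

import re

# Collapse runs of whitespace; a real pipeline would also strip the
# navigation boilerplate itself (e.g. by parsing the HTML directly).
cleaned = re.sub(r"\s+", " ", docs[0].page_content).strip()
print(cleaned[:500])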
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
print(docs[0].page_content[0:200])
# Blendle's Employee Handbook This is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that
docs[0].metadata
{'source': "docs/Notion_DB/Blendle's Employee Handbook e367aa77e225482c849111687e114a56.md"}