Document Loading¶

Note to students.¶

During periods of high load you may find the notebook unresponsive. It may appear to execute a cell and update the completion number in brackets [#] to the left of the cell, even though the cell has not actually executed. This is particularly obvious with print statements that produce no output. If this happens, restart the kernel using the command under the Kernel tab.

Retrieval augmented generation¶

In retrieval augmented generation (RAG), an LLM retrieves contextual documents from an external dataset as part of its execution.

This is useful when we want to ask questions about specific documents (e.g., our PDFs, a set of videos, etc.).

(Figure: overview of retrieval augmented generation)
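To make the pipeline concrete, here is a minimal sketch of the RAG loop; `retriever` and `llm` are hypothetical stand-ins for components built later in the course:

# Sketch only: retrieve relevant documents, then stuff them into the prompt.
def answer(question, retriever, llm):
    docs = retriever(question)  # fetch contextual documents
    context = "\n\n".join(d.page_content for d in docs)
    return llm(f"Use this context:\n{context}\n\nQuestion: {question}")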

In [1]:
#! pip install langchain
In [2]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

PDFs¶

Let's load a PDF transcript from Andrew Ng's famous CS229 course! These documents are the result of automated transcription, so words and sentences are sometimes split unexpectedly.

In [3]:
# The course will show the pip installs you would need to install packages on your own machine.
# These packages are already installed on this platform and should not be run again.
#! pip install pypdf 
In [4]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

Each page is a Document.

A Document contains text (page_content) and metadata.

In [5]:
len(pages)
Out[5]:
22
In [6]:
page = pages[0]
In [7]:
print(page.page_content[0:500])
MachineLearning-Lecture01  
Instructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine 
learning class. So what I wanna do today is ju st spend a little time going over the logistics 
of the class, and then we'll start to  talk a bit about machine learning.  
By way of introduction, my name's  Andrew Ng and I'll be instru ctor for this class. And so 
I personally work in machine learning, and I' ve worked on it for about 15 years now, and 
I actually think that machine learning i
In [8]:
page.metadata
Out[8]:
{'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}
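Since the loader returns one Document per page, it is often convenient to stitch the pages back together or select pages by their metadata. A small sketch using the `pages` list from above:

# Join all pages into one string, or keep only the first few pages.
full_text = "\n".join(p.page_content for p in pages)
intro_pages = [p for p in pages if p.metadata["page"] < 3]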

YouTube¶

In [9]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
In [20]:
# ! pip install yt_dlp
# ! pip install pydub

Note: This can take several minutes to complete.

In [13]:
url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
save_dir="docs/youtube/"
loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)
docs = loader.load()
[youtube] Extracting URL: https://www.youtube.com/watch?v=jGwO_UgTS7I
[youtube] jGwO_UgTS7I: Downloading webpage
[youtube] jGwO_UgTS7I: Downloading android player API JSON
[info] jGwO_UgTS7I: Downloading 1 format(s): 140
[download] docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a has already been downloaded
[download] 100% of   69.71MiB
[ExtractAudio] Not converting audio docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a; file is already in target format m4a
Transcribing part 1!
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
File /usr/local/lib/python3.9/site-packages/openai/api_requestor.py:669, in APIRequestor._interpret_response_line(self, rbody, rcode, rheaders, stream)
    668 try:
--> 669     data = json.loads(rbody)
    670 except (JSONDecodeError, UnicodeDecodeError) as e:

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

The above exception was the direct cause of the following exception:

APIError                                  Traceback (most recent call last)
Cell In[13], line 7
----> 7 docs = loader.load()

File /usr/local/lib/python3.9/site-packages/langchain/document_loaders/parsers/audio.py:51, in OpenAIWhisperParser.lazy_parse(self, blob)
     50 print(f"Transcribing part {split_number+1}!")
---> 51 transcript = openai.Audio.transcribe("whisper-1", file_obj)

APIError: HTTP code 504 from API (<html>
<head><title>504 Gateway Time-out</title></head>
<body>
<center><h1>504 Gateway Time-out</h1></center>
</body>
</html>
)
In [ ]:
docs[0].page_content[0:500]

There seem to be some issues with the API at the moment. This section will be updated in the future.¶
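While the Whisper endpoint is unreliable, one workaround is to skip audio transcription entirely and load the video's published captions with LangChain's YoutubeLoader. This assumes the video has captions available and requires the youtube-transcript-api package:

# Alternative: load existing captions instead of transcribing the audio.
# Assumes captions are available for this video.
from langchain.document_loaders import YoutubeLoader
loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=jGwO_UgTS7I")
docs = loader.load()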

URLs¶

In [14]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")
In [15]:
docs = loader.load()
In [16]:
print(docs[0].page_content[:500])

handbook/37signals-is-you.md at master · basecamp/handbook · GitHub

Skip to content

Toggle navigation

            Sign up

        Product

Actions
        Automate any workflow

Packages
        Host and manage packages

Security
        Find and fix vulnerabilities

Code
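As you can see, the raw page content is mostly whitespace and site navigation, so web pages usually need post-processing before they are useful. A minimal cleanup sketch using plain regular expressions (one simple heuristic among many):

import re
# Collapse runs of whitespace into single spaces so the text is readable.
cleaned = re.sub(r"\s{2,}", " ", docs[0].page_content).strip()
print(cleaned[:500])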

Notion¶

Follow the steps here for an example Notion site, such as this one:

  • Duplicate the page into your own Notion space and export it as Markdown/CSV.
  • Unzip the export and save it as a folder containing the markdown file for the Notion page.


In [17]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()
In [18]:
print(docs[0].page_content[0:200])
# Blendle's Employee Handbook

This is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that
In [19]:
docs[0].metadata
Out[19]:
{'source': "docs/Notion_DB/Blendle's Employee Handbook e367aa77e225482c849111687e114a56.md"}
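Every markdown file in the exported folder becomes its own Document, so with a multi-page export you can check what was loaded by inspecting each document's metadata:

# List the source file behind each loaded Document.
for d in docs:
    print(d.metadata["source"])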