The New Computer Update Part II

KillianLucas · KillianLucas · commit d9829b540085 · 2024-03-11T16:06:43.000-07:00
diff --git a/interpreter/core/computer/browser/browser.py b/interpreter/core/computer/browser/browser.py
@@ -10,6 +10,7 @@ def search(self, query):
         Searches the web for the specified query and returns the results.
         """
         response = requests.get(
-            f'{self.computer.api_base.strip("/")}/browser/search', params={"q": query}
+            f'{self.computer.api_base.strip("/")}/browser/search',
+            params={"query": query},
         )
         return response.json()["result"]
diff --git a/interpreter/core/computer/computer.py b/interpreter/core/computer/computer.py
@@ -43,7 +43,6 @@ def __init__(self, interpreter):
         self.emit_images = True
         self.api_base = "https://api.openinterpreter.com/v0"
         self.save_skills = True
-        # self.api_base = "http://0.0.0.0/v0"
 
         self.import_computer_api = True
         self._has_imported_computer_api = False  # Because we only want to do this once
diff --git a/interpreter/core/computer/display/display.py b/interpreter/core/computer/display/display.py
@@ -1,5 +1,6 @@
 import base64
 import os
+import platform
 import pprint
 import time
 import warnings
@@ -64,12 +65,13 @@ def view(self, show=True, quadrant=None):
     # def get_active_window(self):
     #     return get_active_window()
 
-    def screenshot(self, show=True, quadrant=None, active_app_only=False):
+    def screenshot(
+        self, show=True, quadrant=None, active_app_only=False, force_image=False
+    ):
         """
         Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
         """
-        time.sleep(2)
-        if not self.computer.emit_images:
+        if not self.computer.emit_images and force_image == False:
             text = self.get_text_as_list_of_lists()
             pp = pprint.PrettyPrinter(indent=4)
             pretty_text = pp.pformat(text)  # language models like it pretty!
@@ -89,7 +91,10 @@ def screenshot(self, show=True, quadrant=None, active_app_only=False):
                 region = self.get_active_window()["region"]
                 screenshot = pyautogui.screenshot(region=region)
             else:
-                screenshot = pyautogui.screenshot()
+                if platform.system() == "Darwin":
+                    screenshot = take_screenshot_to_pil()
+                else:
+                    screenshot = pyautogui.screenshot()
                 # message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will recieve it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
                 # print(message)
 
@@ -139,11 +144,14 @@ def find(self, description, screenshot=None):
                     print("NUM HASHES:", len(self._hashes))
                 else:
                     message = format_to_recipient(
-                        "Locating this icon will take ~10 seconds. Subsequent icons should be found more quickly.",
+                        "Locating this icon will take ~15 seconds. Subsequent icons should be found more quickly.",
                         recipient="user",
                     )
                     print(message)
 
+                if len(self._hashes) > 5000:
+                    self._hashes = dict(list(self._hashes.items())[-5000:])
+
                 from .point.point import point
 
                 result = point(
@@ -251,3 +259,24 @@ def get_text_as_list_of_lists(self, screenshot=None):
                 raise Exception(
                     "Failed to find text locally.\n\nTo find text in order to use the mouse, please make sure you've installed `pytesseract` along with the Tesseract executable (see this Stack Overflow answer for help installing Tesseract: https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i)."
                 )
+
+
+import io
+import subprocess
+
+from PIL import Image
+
+
+def take_screenshot_to_pil(filename="temp_screenshot.png"):
+    """Capture the screen with macOS `screencapture` and return it as a PIL image."""
+    try:
+        # -x silences the shutter sound; check=True raises CalledProcessError on failure
+        subprocess.run(["screencapture", "-x", filename], check=True)
+        with open(filename, "rb") as f:
+            image_data = f.read()
+    finally:
+        # Always remove the temporary file, even if capture or read failed
+        if os.path.exists(filename):
+            os.remove(filename)
+    # Decode from memory so the image never depends on the deleted file
+    return Image.open(io.BytesIO(image_data))
diff --git a/interpreter/core/computer/display/point/point.py b/interpreter/core/computer/display/point/point.py
@@ -50,6 +50,8 @@ def point(description, screenshot=None, debug=False, hashes=None):
 
 
 def find_icon(description, screenshot=None, debug=False, hashes=None):
+    if debug:
+        print("STARTING")
     if screenshot == None:
         image_data = take_screenshot_to_pil()
     else:
@@ -68,6 +70,9 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):
 
     icons_bounding_boxes = get_element_boxes(image_data, debug)
 
+    if debug:
+        print("GOT ICON BOUNDING BOXES")
+
     debug_path = os.path.join(os.path.expanduser("~"), "Desktop", "oi-debug")
 
     if debug:
@@ -123,8 +128,14 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):
 
     # # Filter out text
 
+    if debug:
+        print("GETTING TEXT")
+
     response = pytesseract_get_text_bounding_boxes(screenshot)
 
+    if debug:
+        print("GOT TEXT, processing it")
+
     if debug:
         # Create a draw object
         image_data_copy = image_data.copy()
@@ -416,7 +427,13 @@ def combine_boxes(icons_bounding_boxes):
     if "icon" not in description.lower():
         description += " icon"
 
-    top_icons = image_search(description, icons, hashes)
+    if debug:
+        print("FINALLY, SEARCHING")
+
+    top_icons = image_search(description, icons, hashes, debug)
+
+    if debug:
+        print("DONE")
 
     coordinates = [t["coordinate"] for t in top_icons]
 
@@ -478,7 +495,7 @@ def embed_images(images: List[Image.Image], model, transforms):
 model = model.to(device)
 
 
-def image_search(query, icons, hashes):
+def image_search(query, icons, hashes, debug):
     hashed_icons = [icon for icon in icons if icon["hash"] in hashes]
     unhashed_icons = [icon for icon in icons if icon["hash"] not in hashes]
 
@@ -488,7 +505,7 @@ def image_search(query, icons, hashes):
             [query] + [icon["data"] for icon in unhashed_icons],
             batch_size=128,
             convert_to_tensor=True,
-            show_progress_bar=False,
+            show_progress_bar=debug,
         )
     else:
         query_and_unhashed_icons_embeds = embed_images(
@@ -526,9 +543,10 @@ def image_search(query, icons, hashes):
 
 
 def get_element_boxes(image_data, debug):
+    desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
+    debug_path = os.path.join(desktop_path, "oi-debug")
+
     if debug:
-        desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
-        debug_path = os.path.join(desktop_path, "oi-debug")
         if not os.path.exists(debug_path):
             os.makedirs(debug_path)
 
@@ -662,6 +680,9 @@ def process_image(
             pil_image, debug=debug, debug_path=debug_path
         )
 
+    if debug:
+        print("WE HERE")
+
     # Initialize an empty list to store the boxes
     boxes = []
     for contour in contours_contrasted:
@@ -670,30 +691,41 @@ def process_image(
         # Append the box as a dictionary to the list
         boxes.append({"x": x, "y": y, "width": w, "height": h})
 
-    # Remove any boxes whose edges cross over any contours
-    filtered_boxes = []
-    for box in boxes:
-        crosses_contour = False
-        for contour in contours_contrasted:
-            if (
-                cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
-                or cv2.pointPolygonTest(
-                    contour, (box["x"] + box["width"], box["y"]), False
-                )
-                >= 0
-                or cv2.pointPolygonTest(
-                    contour, (box["x"], box["y"] + box["height"]), False
-                )
-                >= 0
-                or cv2.pointPolygonTest(
-                    contour, (box["x"] + box["width"], box["y"] + box["height"]), False
-                )
-                >= 0
-            ):
-                crosses_contour = True
-                break
-        if not crosses_contour:
-            filtered_boxes.append(box)
-    boxes = filtered_boxes
+    if debug:
+        print("WE HHERE")
+
+    if (
+        False
+    ):  # Disabled. I thought this would be faster but it's actually slower than just embedding all of them.
+        # Remove any boxes whose edges cross over any contours
+        filtered_boxes = []
+        for box in boxes:
+            crosses_contour = False
+            for contour in contours_contrasted:
+                if (
+                    cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
+                    or cv2.pointPolygonTest(
+                        contour, (box["x"] + box["width"], box["y"]), False
+                    )
+                    >= 0
+                    or cv2.pointPolygonTest(
+                        contour, (box["x"], box["y"] + box["height"]), False
+                    )
+                    >= 0
+                    or cv2.pointPolygonTest(
+                        contour,
+                        (box["x"] + box["width"], box["y"] + box["height"]),
+                        False,
+                    )
+                    >= 0
+                ):
+                    crosses_contour = True
+                    break
+            if not crosses_contour:
+                filtered_boxes.append(box)
+        boxes = filtered_boxes
+
+    if debug:
+        print("WE HHHERE")
 
     return boxes
diff --git a/interpreter/terminal_interface/profiles/defaults/os.py b/interpreter/terminal_interface/profiles/defaults/os.py
@@ -40,10 +40,13 @@
 computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
 computer.keyboard.write("hello")
 
+# Use this to click text:
 computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
+# Use this to click an icon, button, or other symbol:
+computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often.
+
 computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more.
 computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate
-computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often
 
 computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this
 x, y = computer.display.center() # Get your bearings
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ name = "open-interpreter"
 packages = [
     {include = "interpreter"},
 ]
-version = "0.2.1-rc2" # Use "-rc1", "-rc2", etc. for pre-release versions
+version = "0.2.1" # Use "-rc1", "-rc2", etc. for pre-release versions
 description = "Let language models run code"
 authors = ["Killian Lucas <killian@openinterpreter.com>"]
 readme = "README.md"
diff --git a/tests/test_interpreter.py b/tests/test_interpreter.py
@@ -24,11 +24,12 @@
 
 @pytest.mark.skip(reason="Computer with display only + no way to fail test")
 def test_point():
-    interpreter.offline = True
     # interpreter.computer.debug = True
     interpreter.computer.mouse.move(icon="gear")
     interpreter.computer.mouse.move(icon="refresh")
-    # interpreter.computer.mouse.move("Spaces:")
+    interpreter.computer.mouse.move(icon="play")
+    interpreter.computer.mouse.move(icon="magnifying glass")
+    interpreter.computer.mouse.move("Spaces:")
     assert False
 
 
@@ -62,6 +63,15 @@ def test_skills():
     assert "testing_skilsl" in str(output)
 
 
+@pytest.mark.skip(reason="Local only")
+def test_browser():
+    interpreter.computer.api_base = "http://0.0.0.0:80/v0"  # target a locally hosted API (hence the "Local only" skip)
+    print(
+        interpreter.computer.browser.search("When's the next Dune showing in Seattle?")
+    )
+    assert False  # NOTE(review): always fails if un-skipped — presumably so pytest shows the printed result; confirm
+
+
 @pytest.mark.skip(reason="Computer with display only + no way to fail test")
 def test_display_api():
     start = time.time()

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ def search(self, query):`
`10`	`10`	`Searches the web for the specified query and returns the results.`
`11`	`11`	`"""`
`12`	`12`	`response = requests.get(`
`13`		`- f'{self.computer.api_base.strip("/")}/browser/search', params={"q": query}`
	`13`	`+ f'{self.computer.api_base.strip("/")}/browser/search',`
	`14`	`+ params={"query": query},`
`14`	`15`	`)`
`15`	`16`	`return response.json()["result"]`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@ name = "open-interpreter"`
`3`	`3`	`packages = [`
`4`	`4`	`{include = "interpreter"},`
`5`	`5`	`]`
`6`		`-version = "0.2.1-rc2" # Use "-rc1", "-rc2", etc. for pre-release versions`
	`6`	`+version = "0.2.1" # Use "-rc1", "-rc2", etc. for pre-release versions`
`7`	`7`	`description = "Let language models run code"`
`8`	`9`	`authors = ["Killian Lucas <killian@openinterpreter.com>"]`
`9`	`9`	`readme = "README.md"`